diff --git a/src/main/java/gov/nasa/pds/tools/validate/InMemoryRegistrar.java b/src/main/java/gov/nasa/pds/tools/validate/InMemoryRegistrar.java index 9b2e90ac3..cac8a1616 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/InMemoryRegistrar.java +++ b/src/main/java/gov/nasa/pds/tools/validate/InMemoryRegistrar.java @@ -16,6 +16,8 @@ import java.io.File; import java.net.MalformedURLException; import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -28,14 +30,24 @@ public class InMemoryRegistrar implements TargetRegistrar { private static Logger LOG = LoggerFactory.getLogger(InMemoryRegistrar.class); private ValidationTarget rootTarget; - private Map targets = new HashMap<>(); - private Map collections = new HashMap<>(); - private Map bundles = new HashMap<>(); - private Set referencedTargetLocations = new HashSet<>(); - private Map identifierDefinitions = new HashMap<>(); - private Map> identifierDefinitionsByLid = new HashMap<>(); - private Map identifierReferenceLocations = new HashMap<>(); - private Map> referencedIdentifiersByLid = new HashMap<>(); + private Map targets = new ConcurrentHashMap<>(); + private Map collections = new ConcurrentHashMap<>(); + private Map bundles = new ConcurrentHashMap<>(); + private Set referencedTargetLocations = ConcurrentHashMap.newKeySet(); + private Map identifierDefinitions = new ConcurrentHashMap<>(); + private Map> identifierDefinitionsByLid = new ConcurrentHashMap<>(); + private Map identifierReferenceLocations = new ConcurrentHashMap<>(); + private Map> referencedIdentifiersByLid = new ConcurrentHashMap<>(); + + // Parent-child index: maps parent location to list of direct child locations. + // The inner ArrayLists are only accessed under the instance monitor (both addTarget() + // and getChildTargets() are synchronized), so they are thread-safe despite being mutable. + private Map> childrenByParent = new ConcurrentHashMap<>(); + + // Count-by-type index: tracks target counts per TargetType. + // Uses AtomicInteger to avoid boxed Integer allocations on each merge() call. + private Map targetCountByType = new ConcurrentHashMap<>(); + private AtomicInteger labelCount = new AtomicInteger(0); @Override public ValidationTarget getRoot() { @@ -57,7 +69,33 @@ public synchronized void addTarget(String parentLocation, TargetType type, Strin this.collections.put(location, target); } + boolean isNew = !this.targets.containsKey(location); + + // Only update indexes for genuinely new targets to avoid duplicates. + // Invariant: a location is never re-registered with a different TargetType. + // Both RegisterTargets.java call sites derive the type from the location itself + // (via Utility.getTargetType), so re-registration with a different type cannot + // happen in practice. Log a warning if this invariant is ever violated. + if (!isNew) { + ValidationTarget existing = this.targets.get(location); + if (existing != null && !type.equals(existing.getType())) { + LOG.warn("addTarget(): location {} re-registered with type {} (was {})", + location, type, existing.getType()); + } + } + this.targets.put(location, target); + + if (isNew) { + // Index parent-child relationship for O(1) child lookups + if (parentLocation != null) { + childrenByParent.computeIfAbsent(parentLocation, k -> new ArrayList<>()).add(location); + } + + // Increment count-by-type index + targetCountByType.computeIfAbsent(type, k -> new AtomicInteger(0)).incrementAndGet(); + } + LOG.debug("addTarget(): location: {}, target: {}", location, target); } catch (MalformedURLException e) { // TODO Auto-generated catch block @@ -67,40 +105,39 @@ public synchronized void addTarget(String parentLocation, TargetType type, Strin @Override public synchronized Collection getChildTargets(ValidationTarget parent) { - List children = new ArrayList<>(); - String parentLocation = parent.getLocation() + File.separator; - - for (String targetLocation : targets.keySet()) { - if (targetLocation.startsWith(parentLocation) - && !targetLocation.substring(parentLocation.length()).contains(File.separator)) { - children.add(targets.get(targetLocation)); - } + List childLocations = childrenByParent.getOrDefault(parent.getLocation(), Collections.emptyList()); + List children = new ArrayList<>(childLocations.size()); + for (String loc : childLocations) { + ValidationTarget t = targets.get(loc); + if (t != null) children.add(t); } - Collections.sort(children); return children; } @Override - public synchronized boolean hasTarget(String targetLocation) { + public boolean hasTarget(String targetLocation) { return targets.containsKey(targetLocation); } @Override - public synchronized int getTargetCount(TargetType type) { - int count = 0; - - for (Map.Entry entry : targets.entrySet()) { - if (entry.getValue().getType() == type) { - ++count; - } - } - return count; + public int getTargetCount(TargetType type) { + AtomicInteger count = targetCountByType.get(type); + return count != null ? count.get() : 0; } @Override public synchronized void setTargetIsLabel(String location, boolean isLabel) { - targets.get(location).setLabel(isLabel); + ValidationTarget target = targets.get(location); + boolean wasLabel = target.isLabel(); + target.setLabel(isLabel); + + // Update label count index + if (isLabel && !wasLabel) { + labelCount.incrementAndGet(); + } else if (!isLabel && wasLabel) { + labelCount.decrementAndGet(); + } // Labels refer to themselves. if (isLabel) { @@ -109,15 +146,8 @@ public synchronized void setTargetIsLabel(String location, boolean isLabel) { } @Override - public synchronized int getLabelCount() { - int count = 0; - for (Map.Entry entry : targets.entrySet()) { - if (entry.getValue().isLabel()) { - ++count; - } - } - - return count; + public int getLabelCount() { + return labelCount.get(); } @Override @@ -126,7 +156,7 @@ public synchronized void setTargetIdentifier(String location, Identifier identif LOG.debug("setTargetIdentifier:identifier,location {},{}", identifier, location); identifierDefinitions.put(identifier, location); - identifierDefinitionsByLid.computeIfAbsent(identifier.getLid(), x -> new HashSet<>()).add(identifier); + identifierDefinitionsByLid.computeIfAbsent(identifier.getLid(), x -> ConcurrentHashMap.newKeySet()).add(identifier); } @Override @@ -135,7 +165,7 @@ public synchronized void addTargetReference(String referenceLocation, String tar } @Override - public synchronized boolean isTargetReferenced(String location) { + public boolean isTargetReferenced(String location) { return referencedTargetLocations.contains(location); } @@ -145,11 +175,11 @@ public synchronized void addIdentifierReference(String referenceLocation, Identi String lid = identifier.getLid(); - referencedIdentifiersByLid.computeIfAbsent(identifier.getLid(), x -> new HashSet<>()).add(identifier); + referencedIdentifiersByLid.computeIfAbsent(identifier.getLid(), x -> ConcurrentHashMap.newKeySet()).add(identifier); } @Override - public synchronized boolean isIdentifierReferenced(Identifier identifier, boolean orNearNeighbor) { + public boolean isIdentifierReferenced(Identifier identifier, boolean orNearNeighbor) { boolean result = identifierReferenceLocations.containsKey(identifier); if (!result && orNearNeighbor) { for (Identifier id : this.referencedIdentifiersByLid.getOrDefault(identifier.getLid(), Collections.emptySet())) { diff --git a/src/main/java/gov/nasa/pds/tools/validate/ValidationTarget.java b/src/main/java/gov/nasa/pds/tools/validate/ValidationTarget.java index bb366a02d..fcb59269a 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/ValidationTarget.java +++ b/src/main/java/gov/nasa/pds/tools/validate/ValidationTarget.java @@ -209,4 +209,13 @@ public boolean equals(Object obj) { public URL getUrl() { return url; } + + /** + * Clears the static cache of ValidationTargets to free memory between + * validation runs. Without this, cachedTargets grows unboundedly across + * runs and can cause OOM on large bundles. + */ + public static void clearCache() { + cachedTargets.clear(); + } } diff --git a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java index 00b802f72..55f0b5c84 100644 --- a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java +++ b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java @@ -110,6 +110,7 @@ import gov.nasa.pds.tools.validate.ValidateProblemHandler; import gov.nasa.pds.tools.validate.ValidationProblem; import gov.nasa.pds.tools.validate.ValidationResourceManager; +import gov.nasa.pds.tools.validate.ValidationTarget; import gov.nasa.pds.tools.validate.rule.pds4.SchemaValidator; import gov.nasa.pds.validate.checksum.ChecksumManifest; import gov.nasa.pds.validate.commandline.options.ConfigKey; @@ -1526,6 +1527,9 @@ public boolean doValidation(Map checksumManifest) throws Exception validator.validate(monitor, target); monitor.endValidation(); + // Free cached ValidationTargets to prevent OOM on large bundles + ValidationTarget.clearCache(); + if (validationRule != null) { // If the rule is pds4.label, clear out the list of Information Model Versions // except the first element. diff --git a/src/test/java/gov/nasa/pds/tools/validate/InMemoryRegistrarTest.java b/src/test/java/gov/nasa/pds/tools/validate/InMemoryRegistrarTest.java new file mode 100644 index 000000000..18fb89b91 --- /dev/null +++ b/src/test/java/gov/nasa/pds/tools/validate/InMemoryRegistrarTest.java @@ -0,0 +1,150 @@ +package gov.nasa.pds.tools.validate; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link InMemoryRegistrar}, focusing on the parent-child index + * and count-by-type caching introduced to eliminate O(n) scans. + */ +class InMemoryRegistrarTest { + + private InMemoryRegistrar registrar; + + // Use file:// URLs to match the format used in production + private static final String BUNDLE = "file:///data/bundle"; + private static final String COLLECTION_A = "file:///data/bundle/collection_a"; + private static final String COLLECTION_B = "file:///data/bundle/collection_b"; + private static final String PRODUCT_1 = "file:///data/bundle/collection_a/product1.xml"; + private static final String PRODUCT_2 = "file:///data/bundle/collection_a/product2.xml"; + private static final String PRODUCT_3 = "file:///data/bundle/collection_b/product3.xml"; + + @BeforeEach + void setUp() { + registrar = new InMemoryRegistrar(); + ValidationTarget.clearCache(); + } + + @Test + void testGetChildTargetsMultiLevelHierarchy() { + // Build a 3-level hierarchy: bundle -> collections -> products + registrar.addTarget(null, TargetType.BUNDLE, BUNDLE); + registrar.addTarget(BUNDLE, TargetType.COLLECTION, COLLECTION_A); + registrar.addTarget(BUNDLE, TargetType.COLLECTION, COLLECTION_B); + registrar.addTarget(COLLECTION_A, TargetType.FILE, PRODUCT_1); + registrar.addTarget(COLLECTION_A, TargetType.FILE, PRODUCT_2); + registrar.addTarget(COLLECTION_B, TargetType.FILE, PRODUCT_3); + + // Bundle's children should be only the two collections (not grandchildren) + ValidationTarget bundleTarget = registrar.getRoot(); + Collection bundleChildren = registrar.getChildTargets(bundleTarget); + List bundleChildLocations = bundleChildren.stream() + .map(ValidationTarget::getLocation) + .collect(Collectors.toList()); + + assertEquals(2, bundleChildLocations.size(), + "Bundle should have exactly 2 direct children (collections)"); + assertTrue(bundleChildLocations.contains(COLLECTION_A)); + assertTrue(bundleChildLocations.contains(COLLECTION_B)); + + // Collection A's children should be product1 and product2 + ValidationTarget collATarget = registrar.getTargets().get(COLLECTION_A); + Collection collAChildren = registrar.getChildTargets(collATarget); + List collAChildLocations = collAChildren.stream() + .map(ValidationTarget::getLocation) + .collect(Collectors.toList()); + + assertEquals(2, collAChildLocations.size(), + "Collection A should have exactly 2 direct children (products)"); + assertTrue(collAChildLocations.contains(PRODUCT_1)); + assertTrue(collAChildLocations.contains(PRODUCT_2)); + + // Collection B's children should be only product3 + ValidationTarget collBTarget = registrar.getTargets().get(COLLECTION_B); + Collection collBChildren = registrar.getChildTargets(collBTarget); + List collBChildLocations = collBChildren.stream() + .map(ValidationTarget::getLocation) + .collect(Collectors.toList()); + + assertEquals(1, collBChildLocations.size(), + "Collection B should have exactly 1 direct child"); + assertTrue(collBChildLocations.contains(PRODUCT_3)); + } + + @Test + void testGetChildTargetsLeafNodeReturnsEmpty() { + registrar.addTarget(null, TargetType.BUNDLE, BUNDLE); + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_1); + + // Leaf node should have no children + ValidationTarget product = registrar.getTargets().get(PRODUCT_1); + Collection children = registrar.getChildTargets(product); + assertTrue(children.isEmpty(), "Leaf node should have no children"); + } + + @Test + void testDuplicateAddTargetDoesNotCreateDuplicateChildren() { + registrar.addTarget(null, TargetType.BUNDLE, BUNDLE); + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_1); + // Add the same target again + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_1); + + ValidationTarget bundleTarget = registrar.getRoot(); + Collection children = registrar.getChildTargets(bundleTarget); + assertEquals(1, children.size(), + "Duplicate addTarget should not create duplicate children"); + } + + @Test + void testTargetCountByType() { + registrar.addTarget(null, TargetType.BUNDLE, BUNDLE); + registrar.addTarget(BUNDLE, TargetType.COLLECTION, COLLECTION_A); + registrar.addTarget(BUNDLE, TargetType.COLLECTION, COLLECTION_B); + registrar.addTarget(COLLECTION_A, TargetType.FILE, PRODUCT_1); + registrar.addTarget(COLLECTION_A, TargetType.FILE, PRODUCT_2); + registrar.addTarget(COLLECTION_B, TargetType.FILE, PRODUCT_3); + + assertEquals(1, registrar.getTargetCount(TargetType.BUNDLE)); + assertEquals(2, registrar.getTargetCount(TargetType.COLLECTION)); + assertEquals(3, registrar.getTargetCount(TargetType.FILE)); + assertEquals(0, registrar.getTargetCount(TargetType.DIRECTORY)); + } + + @Test + void testDuplicateAddTargetDoesNotInflateCount() { + registrar.addTarget(null, TargetType.BUNDLE, BUNDLE); + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_1); + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_1); // duplicate + + assertEquals(1, registrar.getTargetCount(TargetType.BUNDLE)); + assertEquals(1, registrar.getTargetCount(TargetType.FILE), + "Duplicate addTarget should not inflate count"); + } + + @Test + void testLabelCount() { + registrar.addTarget(null, TargetType.BUNDLE, BUNDLE); + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_1); + registrar.addTarget(BUNDLE, TargetType.FILE, PRODUCT_2); + + assertEquals(0, registrar.getLabelCount()); + + registrar.setTargetIsLabel(PRODUCT_1, true); + assertEquals(1, registrar.getLabelCount()); + + registrar.setTargetIsLabel(PRODUCT_2, true); + assertEquals(2, registrar.getLabelCount()); + + // Setting the same target as label again should not double-count + registrar.setTargetIsLabel(PRODUCT_1, true); + assertEquals(2, registrar.getLabelCount(), + "Re-setting same label should not increase count"); + } +}