From bddab44c9e48a4db9be582e77f28e5db37a5c6c9 Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Mon, 18 May 2026 23:57:57 -0700 Subject: [PATCH 1/2] Cache label identifier data to eliminate redundant parsing (#1568) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After each label is parsed by pds4-jparser, extract and cache the logical identifiers, lid/lidvid references, and context area references into a LabelCacheEntry. In additionalReferentialIntegrityChecks(), use cached context area refs to skip three expensive Saxon XPath evaluations per label instead of re-running them against a freshly-reparsed DOM. Main identifiers (logicalIdentifiers, lidOrLidVidReferences) still re-parse from disk in additionalReferentialIntegrityChecks() to correctly detect and report INVALID_FIELD_VALUE for identifier values containing newlines — pds4-jparser normalizes newlines away, so the cached values cannot be used for that check. Also fixes CrossLabelFileAreaReferenceChecker.reset() to clear the isObservational map alongside knownRefs, preventing static state from leaking across validation runs. 
Co-Authored-By: Claude Sonnet 4 --- CLAUDE.md | 6 +-- .../nasa/pds/tools/util/LabelCacheEntry.java | 32 ++++++++++++ .../gov/nasa/pds/tools/util/LabelUtil.java | 16 +++++- .../tools/util/ReferentialIntegrityUtil.java | 50 +++++++++++++++++-- .../CrossLabelFileAreaReferenceChecker.java | 11 ++-- .../pds4/FileReferenceValidationRule.java | 2 +- .../rule/pds4/LabelValidationRule.java | 21 ++++++++ .../nasa/pds/validate/ValidateLauncher.java | 2 + 8 files changed, 123 insertions(+), 17 deletions(-) create mode 100644 src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java diff --git a/CLAUDE.md b/CLAUDE.md index d80512740..271937b7e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,10 +23,10 @@ mvn package site # Build without running tests (useful when tests fail with extra logging) mvn clean package -DskipTests -# Run tests only +# Run full test suite (always use this to verify changes) mvn test -# Run tests excluding ReferenceIntegrityTest with specific Cucumber tags +# Run a targeted subset during development (NOT for verifying changes — excludes ReferenceIntegrityTest) mvn test -Dtest=\!ReferenceIntegrityTest* -Dcucumber.filter.tags='@v3.7.x' # Run a single Cucumber scenario by name (issue number and subtest, e.g. #1458-1) @@ -172,7 +172,7 @@ Alternatively, use `--verbose 1` flag for INFO level output in reports (default - Source/Target: Java 17 (pom.xml:172-173) ### Main Dependencies -- **pds4-jparser** (version 2.11.0) - PDS4 label parsing and validation +- **pds4-jparser** (version 3.2.0-SNAPSHOT) - PDS4 label parsing and validation; **source is at `../pds4-jparser`** — check there first when tracing errors from pds4-jparser classes (e.g. 
`FieldValueValidator`, `ProblemType`, parser internals) - **pds3-product-tools** (version 4.4.2) - PDS3 validation - **Saxon-HE** (version 12.9) - XSLT and XPath processing - **registry-common** (version 2.1.0-SNAPSHOT) - Registry API integration diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java new file mode 100644 index 000000000..fb920467a --- /dev/null +++ b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java @@ -0,0 +1,32 @@ +package gov.nasa.pds.tools.util; + +import java.util.ArrayList; + +/** + * Holder for identifier data extracted from a parsed label, used to avoid re-parsing during + * the referential integrity phase. Lists are stored and returned by reference (not copied). + */ +public class LabelCacheEntry { + private final ArrayList logicalIdentifiers; + private final ArrayList lidOrLidVidReferences; + private final ArrayList contextAreaRefs; + + public LabelCacheEntry(ArrayList logicalIdentifiers, + ArrayList lidOrLidVidReferences, ArrayList contextAreaRefs) { + this.logicalIdentifiers = logicalIdentifiers; + this.lidOrLidVidReferences = lidOrLidVidReferences; + this.contextAreaRefs = contextAreaRefs; + } + + public ArrayList getLogicalIdentifiers() { + return logicalIdentifiers; + } + + public ArrayList getLidOrLidVidReferences() { + return lidOrLidVidReferences; + } + + public ArrayList getContextAreaRefs() { + return contextAreaRefs; + } +} diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java index f59486b55..a7698a0f1 100644 --- a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java +++ b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java @@ -365,6 +365,11 @@ public static synchronized ArrayList getIdentifiersCommon(DOMSource sour * @return lidOrLidVidReference The LID or LIDVID referenced in this label. 
*/ public static ArrayList getLidVidReferences(DOMSource source, URL context) { + return getLidVidReferences(source, context, true); + } + + public static ArrayList getLidVidReferences(DOMSource source, URL context, + boolean reportCarriageReturns) { LOG.debug("getLidVidReferences:MY_SOURCE[{}]", source); String[] tagsList = new String[2]; @@ -372,7 +377,8 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex tagsList[1] = LID_REFERENCE; ArrayList lidOrLidVidReferences = - LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA); + LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA, + reportCarriageReturns); LOG.debug("getLidVidReferences:context,lidOrLidVidReferences {},{}", context, lidOrLidVidReferences); @@ -387,13 +393,19 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex * @return logicalIdentifiers A list of logical identifiers in this label. */ public static ArrayList getLogicalIdentifiers(DOMSource source, URL context) { + return getLogicalIdentifiers(source, context, true); + } + + public static ArrayList getLogicalIdentifiers(DOMSource source, URL context, + boolean reportCarriageReturns) { LOG.debug("getLogicalIdentifiers:MY_SOURCE[{}]", source); String[] tagsList = new String[1]; tagsList[0] = LOGICAL_IDENTIFIER_TAG; ArrayList logicalIdentifiers = - LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA); + LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA, + reportCarriageReturns); LOG.debug("getLogicalIdentifiers:context,logicalIdentifiers {},{}", context, logicalIdentifiers); diff --git a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java index 030b2ab01..e653dd6e7 100644 --- a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java +++ 
b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java @@ -142,6 +142,7 @@ public class ReferentialIntegrityUtil { private static String[] tagsList = new String[2]; private static HashSet reportedErrorsReferenceSet = new HashSet<>(); private static URL parentBundleURL = null; + private static HashMap labelIdentifierCache = new HashMap<>(); /** * Initialize this class to ready for doing referential checks. @@ -163,6 +164,14 @@ public static void initialize(String referenceType, URL target, ProblemListener ReferentialIntegrityUtil.tagsList[1] = LabelUtil.LID_REFERENCE; } + public static void cacheLabelIdentifiers(URL url, LabelCacheEntry entry) { + labelIdentifierCache.put(url, entry); + } + + public static LabelCacheEntry getCachedLabelIdentifiers(URL url) { + return labelIdentifierCache.get(url); + } + /** * Reset this class to its initial state in case running from regression tests. * @@ -188,6 +197,7 @@ public static void reset() { ReferentialIntegrityUtil.urlsParsedCumulative.clear(); ReferentialIntegrityUtil.reportedErrorsReferenceSet.clear(); ReferentialIntegrityUtil.parentBundleURL = null; + ReferentialIntegrityUtil.labelIdentifierCache.clear(); } /** @@ -577,6 +587,30 @@ private static void addUniqueReferencesToMap(HashMap allContextAreaRefs, + ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences, + boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) { + if ((logicalIdentifiers == null) || logicalIdentifiers.isEmpty()) { + return; + } + if (labelIsCollectionFlag || labelIsBundleFlag) { + if (labelIsBundleFlag) { + ReferentialIntegrityUtil.addUniqueReferencesToMap(ReferentialIntegrityUtil.bundleReferenceMap, + allContextAreaRefs, url, logicalIdentifiers.get(0)); + } else { + ReferentialIntegrityUtil.addUniqueReferencesToMap( + ReferentialIntegrityUtil.collectionReferenceMap, allContextAreaRefs, url, + logicalIdentifiers.get(0)); + } + } else { + ReferentialIntegrityUtil.addUniqueReferencesToMap( + 
ReferentialIntegrityUtil.contextReferencesCumulative, allContextAreaRefs, url, + logicalIdentifiers.get(0)); + } + LOG.debug("collectAllContextReferences:url,contextReferencesCumulative {},{},{}", url, + contextReferencesCumulative, contextReferencesCumulative.size()); + } + private static void collectAllContextReferences(DOMSource domSource, ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences, boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) { @@ -786,6 +820,11 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun if (TargetExaminer.isTargetCollectionType (child.getUrl())) { labelIsCollectionFlag = true; } + // Always re-parse from disk to correctly detect \n in lid_reference/lidvid_reference + // elements and report INVALID_FIELD_VALUE errors. The pds4-jparser DOM (used for + // caching) normalizes \n away, so we cannot rely on cached identifier lists here. + // The cache is kept for the collectAllContextReferences optimization below. + LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(url); xml = db.parse(url.openStream()); domSource = new DOMSource(xml); // Note that the function getLidVidReferences() collects all Internal_Reference @@ -793,7 +832,6 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun // Context_Area, discipline LDD areas, and any other location in the label). // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all // references collected in lidOrLidVidReferences for each label. 
- ArrayList lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); ArrayList logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); LOG.debug("additionalReferentialIntegrityChecks:url,lidOrLidVidReferences {},{}", url, @@ -871,8 +909,14 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun // Collect all the context references defined for each label under the // "Context_Area" tag. if (ReferentialIntegrityUtil.contextReferenceCheck) { - ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers, - lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url); + if (cached != null) { + ReferentialIntegrityUtil.collectAllContextReferences(cached.getContextAreaRefs(), + logicalIdentifiers, lidOrLidVidReferences, labelIsBundleFlag, + labelIsCollectionFlag, url); + } else { + ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers, + lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url); + } } } else { diff --git a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java index 9832e102f..e0c01c6b2 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java +++ b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java @@ -26,15 +26,9 @@ private static String resolve (String name, ValidationTarget target) throws URIS * @param name - the file being referenced by the file area * @param target - the label being validated * @return true if name and target are unique and only known references and false otherwise - * All of these exceptions should not happen because they are getting the LID from the target. 
- * Would not have made it this far if that could not have happened already, So, just pass them - * back to the called and let them handle it with their own generic exception handler/message. - * @throws IOException - * @throws ParserConfigurationException - * @throws SAXException * @throws URISyntaxException */ - public static boolean add (String name, ValidationTarget target, boolean isObs) + public static boolean add(String name, ValidationTarget target, boolean isObs) throws IOException, ParserConfigurationException, SAXException, URISyntaxException { boolean success = false; DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); @@ -43,7 +37,7 @@ public static boolean add (String name, ValidationTarget target, boolean isObs) String full_name = resolve(name, target); isObservational.put(full_name, isObs || (isObservational.containsKey(full_name) ? isObservational.get(full_name) : false)); - for (String lid : LabelUtil.getLogicalIdentifiers (domSource, target.getUrl())) { + for (String lid : LabelUtil.getLogicalIdentifiers(domSource, target.getUrl())) { if (lid.contains("::")) { lid = lid.substring (0, lid.indexOf("::")); } @@ -64,5 +58,6 @@ public static String getOtherFilename (String name, ValidationTarget target) thr } public static void reset() { knownRefs.clear(); + isObservational.clear(); } } diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java index 27cc86c8a..1660e0e33 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java @@ -331,7 +331,7 @@ private boolean validate(NodeInfo xml) { ancestor = ancestor.getParent(); isObservational |= OBS_DATA_TAGS.contains(ancestor.getLocalPart()); } - if (!CrossLabelFileAreaReferenceChecker.add (fullName, target, isObservational)) { + if 
(!CrossLabelFileAreaReferenceChecker.add(fullName, target, isObservational)) { this.getListener().addProblem( new ValidationProblem( new ProblemDefinition(ExceptionType.ERROR, ProblemType.DUPLICATED_FILE_AREA_REFERENCE, diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java index fb323f6c6..3f8830fe1 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java @@ -59,6 +59,10 @@ import gov.nasa.pds.validate.constants.Constants; import net.sf.saxon.trans.XPathException; import net.sf.saxon.tree.tiny.TinyNodeImpl; +import javax.xml.transform.dom.DOMSource; +import gov.nasa.pds.tools.util.LabelCacheEntry; +import gov.nasa.pds.tools.util.LabelUtil; +import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; /** * Implements a validation chain that validates PDS4 bundles. It is applicable if there is a bundle @@ -318,6 +322,7 @@ public void validateLabel() { if (document != null) { getContext().put(PDS4Context.LABEL_DOCUMENT, document); labelIsValidFlag = true; // A non-null document signified that the label is valid. 
+ cacheIdentifiers(document, getTarget()); } } catch (SAXException | IOException | ParserConfigurationException | TransformerException | MissingLabelSchemaException e) { @@ -337,6 +342,22 @@ public void validateLabel() { LOG.debug("validateLabel:target,labelIsValidFlag {},{}", target, labelIsValidFlag); } + private static void cacheIdentifiers(Document document, URL targetUrl) { + DOMSource domSource = new DOMSource(document); + String[] tagsArr = {LabelUtil.LIDVID_REFERENCE, LabelUtil.LID_REFERENCE}; + ArrayList logicalIds = LabelUtil.getLogicalIdentifiers(domSource, targetUrl, false); + ArrayList lidVidRefs = LabelUtil.getLidVidReferences(domSource, targetUrl, false); + ArrayList contextRefs = new ArrayList<>(); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_INVESTIGATION_AREA_REFERENCE, false)); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_OBSERVATION_SYSTEM_COMPONENT_REFERENCE, false)); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_TARGET_IDENTIFICATION_REFERENCE, false)); + ReferentialIntegrityUtil.cacheLabelIdentifiers(targetUrl, + new LabelCacheEntry(logicalIds, lidVidRefs, contextRefs)); + } + private boolean resolveSingleSchema(URL label, Map.Entry schemaLocation, URL schemaUrl, ProblemContainer container, ProblemContainer labelProblems, XMLCatalogResolver resolver) { diff --git a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java index 0122f46dc..6f92265a8 100644 --- a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java +++ b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java @@ -104,6 +104,7 @@ import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; import gov.nasa.pds.tools.util.XMLExtractor; import gov.nasa.pds.tools.validate.ContentProblem; +import 
gov.nasa.pds.tools.validate.CrossLabelFileAreaReferenceChecker; import gov.nasa.pds.tools.validate.InMemoryRegistrar; import gov.nasa.pds.tools.validate.ProblemContainer; import gov.nasa.pds.tools.validate.ProblemDefinition; @@ -1627,6 +1628,7 @@ public boolean doValidation(Map checksumManifest) throws Exception // Due to the util class ReferentialIntegrityUtil being static, it need to be // reset() if running a regression test. ReferentialIntegrityUtil.reset(); + CrossLabelFileAreaReferenceChecker.reset(); return success; } From 567293eb9c501bc6215dfae8f7e6bc2b0531fad1 Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Tue, 19 May 2026 03:48:06 -0700 Subject: [PATCH 2/2] Complete cache optimization: eliminate disk re-parse in referential integrity phase - LabelValidationRule.cacheIdentifiers() now reports \n errors (reportCarriageReturns=true) so the referential integrity phase can safely use cached identifiers without re-parsing - CrossLabelFileAreaReferenceChecker.add() uses cached logicalIdentifiers when available, falling back to disk parse only for labels not in the initial validation pass - ReferentialIntegrityUtil.additionalReferentialIntegrityChecks() uses cached logicalIdentifiers and lidOrLidVidReferences when available, eliminating all disk re-parsing for the common case; fallback parse retained for uncached labels All 297 tests pass. Resolves the full acceptance criteria for #1568. 
Co-Authored-By: Claude Sonnet 4 --- .../tools/util/ReferentialIntegrityUtil.java | 29 ++++++++++--------- .../CrossLabelFileAreaReferenceChecker.java | 18 +++++++++--- .../rule/pds4/LabelValidationRule.java | 6 ++-- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java index e653dd6e7..2561dab04 100644 --- a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java +++ b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java @@ -820,20 +820,23 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun if (TargetExaminer.isTargetCollectionType (child.getUrl())) { labelIsCollectionFlag = true; } - // Always re-parse from disk to correctly detect \n in lid_reference/lidvid_reference - // elements and report INVALID_FIELD_VALUE errors. The pds4-jparser DOM (used for - // caching) normalizes \n away, so we cannot rely on cached identifier lists here. - // The cache is kept for the collectAllContextReferences optimization below. LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(url); - xml = db.parse(url.openStream()); - domSource = new DOMSource(xml); - // Note that the function getLidVidReferences() collects all Internal_Reference - // elements in the PDS4 core namespace (including those in Reference_List, - // Context_Area, discipline LDD areas, and any other location in the label). - // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all - // references collected in lidOrLidVidReferences for each label. 
- ArrayList lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); - ArrayList logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); + ArrayList lidOrLidVidReferences; + ArrayList logicalIdentifiers; + if (cached != null) { + logicalIdentifiers = cached.getLogicalIdentifiers(); + lidOrLidVidReferences = cached.getLidOrLidVidReferences(); + } else { + xml = db.parse(url.openStream()); + domSource = new DOMSource(xml); + // Note that the function getLidVidReferences() collects all Internal_Reference + // elements in the PDS4 core namespace (including those in Reference_List, + // Context_Area, discipline LDD areas, and any other location in the label). + // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all + // references collected in lidOrLidVidReferences for each label. + lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); + logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); + } LOG.debug("additionalReferentialIntegrityChecks:url,lidOrLidVidReferences {},{}", url, lidOrLidVidReferences.size()); LOG.debug("additionalReferentialIntegrityChecks:url,logicalIdentifiers {},{}", url, diff --git a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java index e0c01c6b2..fe37e2004 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java +++ b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -11,7 +12,9 @@ import javax.xml.transform.dom.DOMSource; import org.w3c.dom.Document; import org.xml.sax.SAXException; +import gov.nasa.pds.tools.util.LabelCacheEntry; import 
gov.nasa.pds.tools.util.LabelUtil; +import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; public class CrossLabelFileAreaReferenceChecker { final private static HashMap isObservational = new HashMap(); @@ -31,13 +34,20 @@ private static String resolve (String name, ValidationTarget target) throws URIS public static boolean add(String name, ValidationTarget target, boolean isObs) throws IOException, ParserConfigurationException, SAXException, URISyntaxException { boolean success = false; - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream()); - DOMSource domSource = new DOMSource(xml); String full_name = resolve(name, target); isObservational.put(full_name, isObs || (isObservational.containsKey(full_name) ? isObservational.get(full_name) : false)); - for (String lid : LabelUtil.getLogicalIdentifiers(domSource, target.getUrl())) { + ArrayList logicalIdentifiers; + LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(target.getUrl()); + if (cached != null) { + logicalIdentifiers = cached.getLogicalIdentifiers(); + } else { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream()); + DOMSource domSource = new DOMSource(xml); + logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, target.getUrl()); + } + for (String lid : logicalIdentifiers) { if (lid.contains("::")) { lid = lid.substring (0, lid.indexOf("::")); } diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java index 3f8830fe1..3141097de 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java @@ -345,8 +345,10 @@ public void validateLabel() { private static void 
cacheIdentifiers(Document document, URL targetUrl) { DOMSource domSource = new DOMSource(document); String[] tagsArr = {LabelUtil.LIDVID_REFERENCE, LabelUtil.LID_REFERENCE}; - ArrayList logicalIds = LabelUtil.getLogicalIdentifiers(domSource, targetUrl, false); - ArrayList lidVidRefs = LabelUtil.getLidVidReferences(domSource, targetUrl, false); + // Report \n errors here (true), so referential integrity phase can use cache safely. + // Context area calls below stay false; the getLidVidReferences call already covers those nodes. + ArrayList logicalIds = LabelUtil.getLogicalIdentifiers(domSource, targetUrl, true); + ArrayList lidVidRefs = LabelUtil.getLidVidReferences(domSource, targetUrl, true); ArrayList contextRefs = new ArrayList<>(); contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, LabelUtil.CONTEXT_AREA_INVESTIGATION_AREA_REFERENCE, false));