diff --git a/CLAUDE.md b/CLAUDE.md index d80512740..271937b7e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,10 +23,10 @@ mvn package site # Build without running tests (useful when tests fail with extra logging) mvn clean package -DskipTests -# Run tests only +# Run full test suite (always use this to verify changes) mvn test -# Run tests excluding ReferenceIntegrityTest with specific Cucumber tags +# Run a targeted subset during development (NOT for verifying changes — excludes ReferenceIntegrityTest) mvn test -Dtest=\!ReferenceIntegrityTest* -Dcucumber.filter.tags='@v3.7.x' # Run a single Cucumber scenario by name (issue number and subtest, e.g. #1458-1) @@ -172,7 +172,7 @@ Alternatively, use `--verbose 1` flag for INFO level output in reports (default - Source/Target: Java 17 (pom.xml:172-173) ### Main Dependencies -- **pds4-jparser** (version 2.11.0) - PDS4 label parsing and validation +- **pds4-jparser** (version 3.2.0-SNAPSHOT) - PDS4 label parsing and validation; **source is at `../pds4-jparser`** — check there first when tracing errors from pds4-jparser classes (e.g. `FieldValueValidator`, `ProblemType`, parser internals) - **pds3-product-tools** (version 4.4.2) - PDS3 validation - **Saxon-HE** (version 12.9) - XSLT and XPath processing - **registry-common** (version 2.1.0-SNAPSHOT) - Registry API integration diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java new file mode 100644 index 000000000..fb920467a --- /dev/null +++ b/src/main/java/gov/nasa/pds/tools/util/LabelCacheEntry.java @@ -0,0 +1,32 @@ +package gov.nasa.pds.tools.util; + +import java.util.ArrayList; + +/** + * Cache entry holding identifier lists extracted from a parsed label, used to avoid re-parsing + * during the referential integrity phase. Fields are final, but the lists are stored and returned by reference (no defensive copies), so callers must not modify them. 
+ */ +public class LabelCacheEntry { + private final ArrayList logicalIdentifiers; + private final ArrayList lidOrLidVidReferences; + private final ArrayList contextAreaRefs; + + public LabelCacheEntry(ArrayList logicalIdentifiers, + ArrayList lidOrLidVidReferences, ArrayList contextAreaRefs) { + this.logicalIdentifiers = logicalIdentifiers; + this.lidOrLidVidReferences = lidOrLidVidReferences; + this.contextAreaRefs = contextAreaRefs; + } + + public ArrayList getLogicalIdentifiers() { + return logicalIdentifiers; + } + + public ArrayList getLidOrLidVidReferences() { + return lidOrLidVidReferences; + } + + public ArrayList getContextAreaRefs() { + return contextAreaRefs; + } +} diff --git a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java index f59486b55..a7698a0f1 100644 --- a/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java +++ b/src/main/java/gov/nasa/pds/tools/util/LabelUtil.java @@ -365,6 +365,11 @@ public static synchronized ArrayList getIdentifiersCommon(DOMSource sour * @return lidOrLidVidReference The LID or LIDVID referenced in this label. 
*/ public static ArrayList getLidVidReferences(DOMSource source, URL context) { + return getLidVidReferences(source, context, true); + } + + public static ArrayList getLidVidReferences(DOMSource source, URL context, + boolean reportCarriageReturns) { LOG.debug("getLidVidReferences:MY_SOURCE[{}]", source); String[] tagsList = new String[2]; @@ -372,7 +377,8 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex tagsList[1] = LID_REFERENCE; ArrayList lidOrLidVidReferences = - LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA); + LabelUtil.getIdentifiersCommon(source, context, tagsList, INTERNAL_REFERENCE_AREA, + reportCarriageReturns); LOG.debug("getLidVidReferences:context,lidOrLidVidReferences {},{}", context, lidOrLidVidReferences); @@ -387,13 +393,19 @@ public static ArrayList getLidVidReferences(DOMSource source, URL contex * @return logicalIdentifiers A list of logical identifiers in this label. */ public static ArrayList getLogicalIdentifiers(DOMSource source, URL context) { + return getLogicalIdentifiers(source, context, true); + } + + public static ArrayList getLogicalIdentifiers(DOMSource source, URL context, + boolean reportCarriageReturns) { LOG.debug("getLogicalIdentifiers:MY_SOURCE[{}]", source); String[] tagsList = new String[1]; tagsList[0] = LOGICAL_IDENTIFIER_TAG; ArrayList logicalIdentifiers = - LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA); + LabelUtil.getIdentifiersCommon(source, context, tagsList, IDENTIFICATION_AREA, + reportCarriageReturns); LOG.debug("getLogicalIdentifiers:context,logicalIdentifiers {},{}", context, logicalIdentifiers); diff --git a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java index 030b2ab01..2561dab04 100644 --- a/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java +++ 
b/src/main/java/gov/nasa/pds/tools/util/ReferentialIntegrityUtil.java @@ -142,6 +142,7 @@ public class ReferentialIntegrityUtil { private static String[] tagsList = new String[2]; private static HashSet reportedErrorsReferenceSet = new HashSet<>(); private static URL parentBundleURL = null; + private static HashMap labelIdentifierCache = new HashMap<>(); /** * Initialize this class to ready for doing referential checks. @@ -163,6 +164,14 @@ public static void initialize(String referenceType, URL target, ProblemListener ReferentialIntegrityUtil.tagsList[1] = LabelUtil.LID_REFERENCE; } + public static void cacheLabelIdentifiers(URL url, LabelCacheEntry entry) { + labelIdentifierCache.put(url, entry); + } + + public static LabelCacheEntry getCachedLabelIdentifiers(URL url) { + return labelIdentifierCache.get(url); + } + /** * Reset this class to its initial state in case running from regression tests. * @@ -188,6 +197,7 @@ public static void reset() { ReferentialIntegrityUtil.urlsParsedCumulative.clear(); ReferentialIntegrityUtil.reportedErrorsReferenceSet.clear(); ReferentialIntegrityUtil.parentBundleURL = null; + ReferentialIntegrityUtil.labelIdentifierCache.clear(); } /** @@ -577,6 +587,30 @@ private static void addUniqueReferencesToMap(HashMap allContextAreaRefs, + ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences, + boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) { + if ((logicalIdentifiers == null) || logicalIdentifiers.isEmpty()) { + return; + } + if (labelIsCollectionFlag || labelIsBundleFlag) { + if (labelIsBundleFlag) { + ReferentialIntegrityUtil.addUniqueReferencesToMap(ReferentialIntegrityUtil.bundleReferenceMap, + allContextAreaRefs, url, logicalIdentifiers.get(0)); + } else { + ReferentialIntegrityUtil.addUniqueReferencesToMap( + ReferentialIntegrityUtil.collectionReferenceMap, allContextAreaRefs, url, + logicalIdentifiers.get(0)); + } + } else { + ReferentialIntegrityUtil.addUniqueReferencesToMap( + 
ReferentialIntegrityUtil.contextReferencesCumulative, allContextAreaRefs, url, + logicalIdentifiers.get(0)); + } + LOG.debug("collectAllContextReferences:url,contextReferencesCumulative {},{},{}", url, + contextReferencesCumulative, contextReferencesCumulative.size()); + } + private static void collectAllContextReferences(DOMSource domSource, ArrayList logicalIdentifiers, ArrayList lidOrLidVidReferences, boolean labelIsBundleFlag, boolean labelIsCollectionFlag, URL url) { @@ -786,16 +820,23 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun if (TargetExaminer.isTargetCollectionType (child.getUrl())) { labelIsCollectionFlag = true; } - xml = db.parse(url.openStream()); - domSource = new DOMSource(xml); - // Note that the function getLidVidReferences() collects all Internal_Reference - // elements in the PDS4 core namespace (including those in Reference_List, - // Context_Area, discipline LDD areas, and any other location in the label). - // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all - // references collected in lidOrLidVidReferences for each label. - - ArrayList lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); - ArrayList logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); + LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(url); + ArrayList lidOrLidVidReferences; + ArrayList logicalIdentifiers; + if (cached != null) { + logicalIdentifiers = cached.getLogicalIdentifiers(); + lidOrLidVidReferences = cached.getLidOrLidVidReferences(); + } else { + xml = db.parse(url.openStream()); + domSource = new DOMSource(xml); + // Note that the function getLidVidReferences() collects all Internal_Reference + // elements in the PDS4 core namespace (including those in Reference_List, + // Context_Area, discipline LDD areas, and any other location in the label). 
+ // so the lidOrLidVidReferencesCumulative will be a cumulative collection of all + // references collected in lidOrLidVidReferences for each label. + lidOrLidVidReferences = LabelUtil.getLidVidReferences(domSource, url); + logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, url); + } LOG.debug("additionalReferentialIntegrityChecks:url,lidOrLidVidReferences {},{}", url, lidOrLidVidReferences.size()); LOG.debug("additionalReferentialIntegrityChecks:url,logicalIdentifiers {},{}", url, @@ -871,8 +912,14 @@ public static void additionalReferentialIntegrityChecks(URL crawlTarget, URL bun // Collect all the context references defined for each label under the // "Context_Area" tag. if (ReferentialIntegrityUtil.contextReferenceCheck) { - ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers, - lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url); + if (cached != null) { + ReferentialIntegrityUtil.collectAllContextReferences(cached.getContextAreaRefs(), + logicalIdentifiers, lidOrLidVidReferences, labelIsBundleFlag, + labelIsCollectionFlag, url); + } else { + ReferentialIntegrityUtil.collectAllContextReferences(domSource, logicalIdentifiers, + lidOrLidVidReferences, labelIsBundleFlag, labelIsCollectionFlag, url); + } } } else { diff --git a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java index 9832e102f..fe37e2004 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java +++ b/src/main/java/gov/nasa/pds/tools/validate/CrossLabelFileAreaReferenceChecker.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -11,7 +12,9 @@ import javax.xml.transform.dom.DOMSource; import org.w3c.dom.Document; import 
org.xml.sax.SAXException; +import gov.nasa.pds.tools.util.LabelCacheEntry; import gov.nasa.pds.tools.util.LabelUtil; +import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; public class CrossLabelFileAreaReferenceChecker { final private static HashMap isObservational = new HashMap(); @@ -26,24 +29,25 @@ private static String resolve (String name, ValidationTarget target) throws URIS * @param name - the file being referenced by the file area * @param target - the label being validated * @return true if name and target are unique and only known references and false otherwise - * All of these exceptions should not happen because they are getting the LID from the target. - * Would not have made it this far if that could not have happened already, So, just pass them - * back to the called and let them handle it with their own generic exception handler/message. - * @throws IOException - * @throws ParserConfigurationException - * @throws SAXException * @throws URISyntaxException */ - public static boolean add (String name, ValidationTarget target, boolean isObs) + public static boolean add(String name, ValidationTarget target, boolean isObs) throws IOException, ParserConfigurationException, SAXException, URISyntaxException { boolean success = false; - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream()); - DOMSource domSource = new DOMSource(xml); String full_name = resolve(name, target); isObservational.put(full_name, isObs || (isObservational.containsKey(full_name) ? 
isObservational.get(full_name) : false)); - for (String lid : LabelUtil.getLogicalIdentifiers (domSource, target.getUrl())) { + ArrayList logicalIdentifiers; + LabelCacheEntry cached = ReferentialIntegrityUtil.getCachedLabelIdentifiers(target.getUrl()); + if (cached != null) { + logicalIdentifiers = cached.getLogicalIdentifiers(); + } else { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + Document xml = dbf.newDocumentBuilder().parse(target.getUrl().openStream()); + DOMSource domSource = new DOMSource(xml); + logicalIdentifiers = LabelUtil.getLogicalIdentifiers(domSource, target.getUrl()); + } + for (String lid : logicalIdentifiers) { if (lid.contains("::")) { lid = lid.substring (0, lid.indexOf("::")); } @@ -64,5 +68,6 @@ public static String getOtherFilename (String name, ValidationTarget target) thr } public static void reset() { knownRefs.clear(); + isObservational.clear(); } } diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java index 27cc86c8a..1660e0e33 100644 --- a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/FileReferenceValidationRule.java @@ -331,7 +331,7 @@ private boolean validate(NodeInfo xml) { ancestor = ancestor.getParent(); isObservational |= OBS_DATA_TAGS.contains(ancestor.getLocalPart()); } - if (!CrossLabelFileAreaReferenceChecker.add (fullName, target, isObservational)) { + if (!CrossLabelFileAreaReferenceChecker.add(fullName, target, isObservational)) { this.getListener().addProblem( new ValidationProblem( new ProblemDefinition(ExceptionType.ERROR, ProblemType.DUPLICATED_FILE_AREA_REFERENCE, diff --git a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java index fb323f6c6..3141097de 100644 --- 
a/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java +++ b/src/main/java/gov/nasa/pds/tools/validate/rule/pds4/LabelValidationRule.java @@ -59,6 +59,10 @@ import gov.nasa.pds.validate.constants.Constants; import net.sf.saxon.trans.XPathException; import net.sf.saxon.tree.tiny.TinyNodeImpl; +import javax.xml.transform.dom.DOMSource; +import gov.nasa.pds.tools.util.LabelCacheEntry; +import gov.nasa.pds.tools.util.LabelUtil; +import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; /** * Implements a validation chain that validates PDS4 bundles. It is applicable if there is a bundle @@ -318,6 +322,7 @@ public void validateLabel() { if (document != null) { getContext().put(PDS4Context.LABEL_DOCUMENT, document); labelIsValidFlag = true; // A non-null document signified that the label is valid. + cacheIdentifiers(document, getTarget()); } } catch (SAXException | IOException | ParserConfigurationException | TransformerException | MissingLabelSchemaException e) { @@ -337,6 +342,24 @@ public void validateLabel() { LOG.debug("validateLabel:target,labelIsValidFlag {},{}", target, labelIsValidFlag); } + private static void cacheIdentifiers(Document document, URL targetUrl) { + DOMSource domSource = new DOMSource(document); + String[] tagsArr = {LabelUtil.LIDVID_REFERENCE, LabelUtil.LID_REFERENCE}; + // Report \n errors here (true), so referential integrity phase can use cache safely. + // Context area calls stay false — getLidVidReferences(true) above already covers those nodes. 
+ ArrayList logicalIds = LabelUtil.getLogicalIdentifiers(domSource, targetUrl, true); + ArrayList lidVidRefs = LabelUtil.getLidVidReferences(domSource, targetUrl, true); + ArrayList contextRefs = new ArrayList<>(); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_INVESTIGATION_AREA_REFERENCE, false)); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_OBSERVATION_SYSTEM_COMPONENT_REFERENCE, false)); + contextRefs.addAll(LabelUtil.getIdentifiersCommon(domSource, targetUrl, tagsArr, + LabelUtil.CONTEXT_AREA_TARGET_IDENTIFICATION_REFERENCE, false)); + ReferentialIntegrityUtil.cacheLabelIdentifiers(targetUrl, + new LabelCacheEntry(logicalIds, lidVidRefs, contextRefs)); + } + private boolean resolveSingleSchema(URL label, Map.Entry schemaLocation, URL schemaUrl, ProblemContainer container, ProblemContainer labelProblems, XMLCatalogResolver resolver) { diff --git a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java index 0122f46dc..6f92265a8 100644 --- a/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java +++ b/src/main/java/gov/nasa/pds/validate/ValidateLauncher.java @@ -104,6 +104,7 @@ import gov.nasa.pds.tools.util.ReferentialIntegrityUtil; import gov.nasa.pds.tools.util.XMLExtractor; import gov.nasa.pds.tools.validate.ContentProblem; +import gov.nasa.pds.tools.validate.CrossLabelFileAreaReferenceChecker; import gov.nasa.pds.tools.validate.InMemoryRegistrar; import gov.nasa.pds.tools.validate.ProblemContainer; import gov.nasa.pds.tools.validate.ProblemDefinition; @@ -1627,6 +1628,7 @@ public boolean doValidation(Map checksumManifest) throws Exception // Due to the util class ReferentialIntegrityUtil being static, it need to be // reset() if running a regression test. ReferentialIntegrityUtil.reset(); + CrossLabelFileAreaReferenceChecker.reset(); return success; }