diff --git a/CHANGELOG.md b/CHANGELOG.md index c948863..f37d38f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Upcoming changes... +- `calculateOppositeLineEndingHash()` method in `WinnowingUtils` to compute hash with opposite line endings (Unix ↔ Windows) +- FH2 hash included in WFP output format as `fh2=` +- Support for detecting CRLF (Windows), LF (Unix), and CR (legacy Mac) line endings ## [0.11.0] - 2025-05-26 ### Added diff --git a/README.md b/README.md index 0475752..8947885 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ public class Test { The package also ships with a sample CLI. It can be run using the example script [scanoss-cli.sh](scanoss-cli.sh): ```bash -scanos-cli.sh -h +scanoss-cli.sh -h ``` ### Custom Certificate @@ -91,7 +91,7 @@ packaging/releasing an update. The following commands are provided for incrementing version: ```bash -make inc_path +make inc_patch make inc_minor make inc_major ``` diff --git a/src/main/java/com/scanoss/Winnowing.java b/src/main/java/com/scanoss/Winnowing.java index c868d8e..aa2c8eb 100644 --- a/src/main/java/com/scanoss/Winnowing.java +++ b/src/main/java/com/scanoss/Winnowing.java @@ -158,6 +158,11 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c return wfpBuilder.toString(); } + String fh2 = WinnowingUtils.calculateOppositeLineEndingHash(contents); + if (fh2 != null){ + wfpBuilder.append(String.format("fh2=%s\n",fh2)); + } + if(this.isHpsm()){ wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents))); } diff --git a/src/main/java/com/scanoss/utils/WinnowingUtils.java b/src/main/java/com/scanoss/utils/WinnowingUtils.java index 0e99d15..abc15b0 100644 --- a/src/main/java/com/scanoss/utils/WinnowingUtils.java +++ b/src/main/java/com/scanoss/utils/WinnowingUtils.java @@ -22,8 +22,12 @@ */ package com.scanoss.utils; +import 
lombok.AllArgsConstructor; +import lombok.Getter; import lombok.NonNull; +import org.apache.commons.codec.digest.DigestUtils; +import java.io.ByteArrayOutputStream; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; @@ -37,6 +41,17 @@ */ public class WinnowingUtils { + /** + * Inner class to hold line ending detection results. + */ + @Getter + @AllArgsConstructor + public static class LineEndingInfo { + private final boolean hasCrlf; + private final boolean hasStandaloneLf; + private final boolean hasStandaloneCr; + } + /** * Normalise the given character * @@ -95,4 +110,132 @@ public static Set extractFilePathsFromWFPBlock(@NonNull String wfpBlock) return paths; } + + /** + * Calculate hash for contents with opposite line endings. + * If the file contains only Windows (CRLF) line endings, calculates the Unix (LF) hash. + * Otherwise (LF, CR, or mixed line endings), calculates the Windows (CRLF) hash. + * + * @param contents File contents as bytes + * @return Hash with opposite line endings as hex string, or null if no line endings detected + */ + public static String calculateOppositeLineEndingHash(byte[] contents) { + LineEndingInfo lineEndingInfo = detectLineEndings(contents); + + // If no line endings detected, return null + if (!lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) { + return null; + } + + // Normalize all line endings to LF first + byte[] normalized = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{'\n'}); + normalized = replaceSequence(normalized, new byte[]{'\r'}, new byte[]{'\n'}); + + byte[] oppositeContents; + + // Only a purely CRLF file converts to LF; everything else converts to CRLF + if (lineEndingInfo.hasCrlf && !lineEndingInfo.hasStandaloneLf && !lineEndingInfo.hasStandaloneCr) { + // File is Windows (CRLF) - produce Unix (LF) hash + oppositeContents = normalized; + } else { + // File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash + oppositeContents = replaceSequence(normalized, new byte[]{'\n'}, new 
byte[]{'\r', '\n'}); + } + + return DigestUtils.md5Hex(oppositeContents); + } + + /** + * Detect the types of line endings present in file contents. + * + * @param contents File contents as bytes + * @return LineEndingInfo indicating which line ending types are present + */ + private static LineEndingInfo detectLineEndings(byte[] contents) { + // Check for CRLF (Windows line endings) + boolean hasCrlf = containsSequence(contents, new byte[]{'\r', '\n'}); + + // Remove all CRLF sequences to check for standalone LF and CR + byte[] contentWithoutCrlf = replaceSequence(contents, new byte[]{'\r', '\n'}, new byte[]{}); + + // Check for standalone LF (not part of CRLF) + boolean hasStandaloneLf = containsSequence(contentWithoutCrlf, new byte[]{'\n'}); + + // Check for standalone CR (not part of CRLF) + boolean hasStandaloneCr = containsSequence(contentWithoutCrlf, new byte[]{'\r'}); + + return new LineEndingInfo(hasCrlf, hasStandaloneLf, hasStandaloneCr); + } + + /** + * Check if a byte array contains a specific sequence of bytes. + * + * @param data The byte array to search in + * @param sequence The sequence to search for + * @return true if the sequence is found, false otherwise + */ + private static boolean containsSequence(byte[] data, byte[] sequence) { + if (sequence.length == 0 || data.length < sequence.length) { + return false; + } + + for (int i = 0; i <= data.length - sequence.length; i++) { + boolean found = true; + for (int j = 0; j < sequence.length; j++) { + if (data[i + j] != sequence[j]) { + found = false; + break; + } + } + if (found) { + return true; + } + } + return false; + } + + /** + * Replace all occurrences of a byte sequence with another sequence. + * Uses ByteArrayOutputStream for better performance compared to List. 
+ * + * @param data The original byte array + * @param search The sequence to search for + * @param replacement The sequence to replace with + * @return A new byte array with replacements made + */ + private static byte[] replaceSequence(byte[] data, byte[] search, byte[] replacement) { + if (search.length == 0) { + return data; + } + + ByteArrayOutputStream result = new ByteArrayOutputStream(data.length); + int i = 0; + + while (i < data.length) { + boolean found = false; + + // Check if we have a match at current position + if (i <= data.length - search.length) { + found = true; + for (int j = 0; j < search.length; j++) { + if (data[i + j] != search[j]) { + found = false; + break; + } + } + } + + if (found) { + // Add replacement bytes + result.write(replacement, 0, replacement.length); + i += search.length; + } else { + // Add current byte + result.write(data[i]); + i++; + } + } + + return result.toByteArray(); + } } diff --git a/src/test/java/com/scanoss/TestWinnowing.java b/src/test/java/com/scanoss/TestWinnowing.java index e75e353..c3b5fa0 100644 --- a/src/test/java/com/scanoss/TestWinnowing.java +++ b/src/test/java/com/scanoss/TestWinnowing.java @@ -105,6 +105,7 @@ public void TestWinnowingContentsHPSM() { assertNotNull(wfp); assertFalse(wfp.isEmpty()); assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" + + "fh2=0bd0edfa2f3d4903c51b9fd910409942\n" + "hpsm=df13c104d4\n" + "3=0ed5027a,a9442399,d019b836\n" + "4=613d56c0\n" + diff --git a/src/test/java/com/scanoss/utils/WinnowingUtilsTest.java b/src/test/java/com/scanoss/utils/WinnowingUtilsTest.java index 0785992..9bcef0b 100644 --- a/src/test/java/com/scanoss/utils/WinnowingUtilsTest.java +++ b/src/test/java/com/scanoss/utils/WinnowingUtilsTest.java @@ -84,4 +84,110 @@ public void testExtractFilePathsFromWFPBlock_ComplexCase_HandlesCorrectly() { assertTrue(result.contains("/path/to/file2")); assertTrue(result.contains("/path/to/file3")); } + + // Tests for calculateOppositeLineEndingHash + 
@Test + public void testCalculateOppositeLineEndingHash_UnixToWindows_ReturnsWindowsHash() { + // Unix file with LF line endings + String unixContent = "line1\nline2\nline3\n"; + byte[] unixBytes = unixContent.getBytes(); + + // Expected: Windows content with CRLF + String windowsContent = "line1\r\nline2\r\nline3\r\n"; + byte[] windowsBytes = windowsContent.getBytes(); + String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes); + assertEquals(expectedHash, result); + } + + @Test + public void testCalculateOppositeLineEndingHash_WindowsToUnix_ReturnsUnixHash() { + // Windows file with CRLF line endings + String windowsContent = "line1\r\nline2\r\nline3\r\n"; + byte[] windowsBytes = windowsContent.getBytes(); + + // Expected: Unix content with LF + String unixContent = "line1\nline2\nline3\n"; + byte[] unixBytes = unixContent.getBytes(); + String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(unixBytes); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(windowsBytes); + assertEquals(expectedHash, result); + } + + @Test + public void testCalculateOppositeLineEndingHash_NoLineEndings_ReturnsNull() { + // Content without any line endings + String content = "single line with no line endings"; + byte[] bytes = content.getBytes(); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(bytes); + assertNull(result); + } + + @Test + public void testCalculateOppositeLineEndingHash_EmptyContent_ReturnsNull() { + byte[] emptyBytes = new byte[0]; + + String result = WinnowingUtils.calculateOppositeLineEndingHash(emptyBytes); + assertNull(result); + } + + @Test + public void testCalculateOppositeLineEndingHash_MixedLineEndings_ReturnsWindowsHash() { + // Mixed line endings (LF and CRLF) - should produce Windows hash + String mixedContent = "line1\nline2\r\nline3\n"; + byte[] mixedBytes = mixedContent.getBytes(); + + // 
Expected: all normalized to Windows CRLF + String windowsContent = "line1\r\nline2\r\nline3\r\n"; + byte[] windowsBytes = windowsContent.getBytes(); + String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(mixedBytes); + assertEquals(expectedHash, result); + } + + @Test + public void testCalculateOppositeLineEndingHash_OnlyCarriageReturn_ReturnsWindowsHash() { + // Old Mac-style CR line endings + String crContent = "line1\rline2\rline3\r"; + byte[] crBytes = crContent.getBytes(); + + // Expected: Windows CRLF + String windowsContent = "line1\r\nline2\r\nline3\r\n"; + byte[] windowsBytes = windowsContent.getBytes(); + String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(crBytes); + assertEquals(expectedHash, result); + } + + @Test + public void testCalculateOppositeLineEndingHash_SingleLineWithLF_ReturnsWindowsHash() { + String unixContent = "single line\n"; + byte[] unixBytes = unixContent.getBytes(); + + String windowsContent = "single line\r\n"; + byte[] windowsBytes = windowsContent.getBytes(); + String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes); + assertEquals(expectedHash, result); + } + + @Test + public void testCalculateOppositeLineEndingHash_MultipleConsecutiveLineEndings_HandlesCorrectly() { + // Multiple consecutive line endings (blank lines) + String unixContent = "line1\n\n\nline2\n"; + byte[] unixBytes = unixContent.getBytes(); + + String windowsContent = "line1\r\n\r\n\r\nline2\r\n"; + byte[] windowsBytes = windowsContent.getBytes(); + String expectedHash = org.apache.commons.codec.digest.DigestUtils.md5Hex(windowsBytes); + + String result = WinnowingUtils.calculateOppositeLineEndingHash(unixBytes); + assertEquals(expectedHash, result); + 
} }