From ba93096560c706548f58e30d3819591470d4c6d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 12 Nov 2025 10:14:09 +0000 Subject: [PATCH] feat: detect binary artifacts during scanning CyberFerret scanned only text before, so binary files committed by mistake (build outputs, archives) went unnoticed. The scanner now reports each binary file as a BINARY_ARTIFACT finding and fails the scan, so the CLI/pre-commit hook exits non-zero. A file counts as binary when its first 16 KiB contains a NUL byte; UTF-16 text is detected by its encoding and kept out of the check. Supported image formats are scanned for embedded metadata instead of being flagged. Legitimate binaries can be whitelisted with the new (binary-exclude) dictionary key, a comma-separated list of file-name regex patterns; the key applies to both the GUI and the CLI. Co-Authored-By: Claude Opus 4.8 --- README.md | 8 +- .../exadmin/cyberferret/CyberFerretCLI.java | 1 + .../cyberferret/async/RunnableScanner.java | 41 +++++++ .../cyberferret/async/RunnableSigsLoader.java | 27 +++++ .../exadmin/cyberferret/utils/FileUtils.java | 48 ++++++++ .../async/RunnableScannerTests.java | 85 ++++++++++++++ .../cyberferret/utils/FileUtilsTests.java | 105 ++++++++++++++++++ .../cyberferret/fxui/SceneBuilder.java | 1 + 8 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 common/src/test/java/com/github/exadmin/cyberferret/utils/FileUtilsTests.java diff --git a/README.md b/README.md index ef5fa12..cdbe73b 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,12 @@ Set VM options "--module-path "...\JDKs\javafx-sdk-24.0.1\lib" --add-modules ja # Reserved key 'VERSION' is used for users notifications only, may be skipped VERSION=1.1 -# All key names may be in 3 formats +# All key names may be in 5 formats # KEY_NAME=VALUE - means the ferret will search for VALUE-string case-insensitive, the VALUE-string will be converted to RegExp pattern '\bVALUE\b'.
Note: all spaces inside will be replaced with '\\s+', all special chars (&, -, +) will be escaped by '\\' # KEY_NAME(regexp)=VALUE - means you have finally defined RegExp pattern, and it will be used as is # KEY_NAME(allowed)=VALUE - means you have defined exact string - which may be found during scanned, but must be treated as allowed. Actually no matter what key name will be used - the value is a global string. # KEY_NAME(exclude-ext)=VALUE1,VALUE2,etc.. - list of file extentions to be ignored for the "KEY_NAME" signature +# BINARY_ARTIFACTS(binary-exclude)=PATTERN1,PATTERN2,etc.. - list of regex patterns, each matched against the whole file name, for binary files to exclude from detection (e.g., gradle-wrapper\\.jar) # Notes: all key names must be unique Examples @@ -93,4 +94,9 @@ PASSW-003=qwerty123 IP-ADDR(regexp)=((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?) IP-ADDR-1(allowed)=0.0.0.0 IP-ADDR-2(allowed)=127.0.0.1 + +# Binary Artifacts Detection +# The tool automatically detects binary files (files containing a NUL (0x00) byte in their first 16 KiB) +# Use binary-exclude to whitelist binaries that are committed on purpose, e.g.
the Gradle and Maven wrapper jars +BINARY_ARTIFACTS(binary-exclude)=gradle-wrapper\\.jar,maven-wrapper\\.jar ``` diff --git a/cli/src/main/java/com/github/exadmin/cyberferret/CyberFerretCLI.java b/cli/src/main/java/com/github/exadmin/cyberferret/CyberFerretCLI.java index 8162532..4dfa93f 100644 --- a/cli/src/main/java/com/github/exadmin/cyberferret/CyberFerretCLI.java +++ b/cli/src/main/java/com/github/exadmin/cyberferret/CyberFerretCLI.java @@ -139,6 +139,7 @@ private static void _main(String[] args) { runnableScanner.setSignaturesMap(sigsLoader.getSignaturesMap()); runnableScanner.setAllowedSignaturesMap(sigsLoader.getAllowedSignaturesMap()); runnableScanner.setExcludeExtMap(sigsLoader.getExcludeExtsMap()); + runnableScanner.setBinaryExcludePatterns(sigsLoader.getBinaryExcludePatterns()); runnableScanner.setDirToScan(repoPathToScan.toString()); runnableScanner.setStagedFiles(stagedFiles); runnableScanner.run(); diff --git a/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java b/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java index 008ef8a..988b896 100644 --- a/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java +++ b/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java @@ -6,6 +6,7 @@ import com.github.exadmin.cyberferret.model.FoundPathItem; import com.github.exadmin.cyberferret.model.ItemType; import com.github.exadmin.cyberferret.utils.FileUtils; +import com.github.exadmin.cyberferret.utils.ImgUtils; import com.github.exadmin.cyberferret.utils.MiscUtils; import java.io.File; @@ -29,6 +30,7 @@ public class RunnableScanner extends ARunnable { private FxCallback fxCallback = (type, message) -> logInfo(message); private boolean isAnySignatureFound = false; private List stagedFiles; + private List binaryExcludePatterns = null; public RunnableScanner(boolean isCLIMode) { super(isCLIMode); @@ -46,6 +48,10 @@ public void setExcludeExtMap(Map> excludeExtMap) { 
this.excludeExtMap = excludeExtMap; } + public void setBinaryExcludePatterns(List binaryExcludePatterns) { + this.binaryExcludePatterns = binaryExcludePatterns; + } + public void setDirToScan(String dirToScan) { this.dirToScan = dirToScan; } @@ -256,6 +262,41 @@ private void scan(FoundPathItem pathItem, Path rootDir, ExcludeFileModel exclude if (pathItem.getType() == ItemType.DIRECTORY || pathItem.getType() == ItemType.SIGNATURE) return; Path filePath = pathItem.getFilePath(); + + // Binary files are usually committed by accident, so report them as artifacts instead of + // text-scanning them. Supported images are the exception: readFile() extracts their metadata + // (e.g. EXIF) and scans that text, so they must fall through to the normal path below. + boolean isBinary = false; + try { + isBinary = FileUtils.isBinaryFile(filePath); + } catch (IOException ex) { + logWarn("Could not determine if file '{}' is binary. Treating as text.", filePath); + } + + String extension = FileUtils.getFileExtensionAsString(filePath); + if (isBinary && !ImgUtils.isSupportedImageFormat(extension)) { + if (FileUtils.matchesAnyPattern(filePath, binaryExcludePatterns)) { + logDebug("Binary file '{}' is excluded from detection", filePath); + return; + } + + FoundPathItem binaryItem = new FoundPathItem(filePath, ItemType.SIGNATURE, pathItem); + binaryItem.setVisualName("BINARY_ARTIFACT"); + binaryItem.setLineNumber(0); + binaryItem.setDisplayText("Binary file detected (contains zero bytes in first 16KiB)"); + binaryItem.setFoundString(filePath.getFileName().toString()); + + calculateIgnoreFlagState(binaryItem, pathItem, rootDir, excludeFileModel); + foundItemsContainer.addItem(binaryItem); + + // a non-ignored artifact must fail the scan so the CLI/pre-commit hook exits non-zero + if (!binaryItem.isIgnored()) { + isAnySignatureFound = true; + logWarn("Binary artifact detected: {}", filePath); + } + return; + } + String fileBody; try { logTrace("Reading file {}", filePath); diff --git 
a/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableSigsLoader.java b/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableSigsLoader.java index f222980..89a3922 100644 --- a/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableSigsLoader.java +++ b/common/src/main/java/com/github/exadmin/cyberferret/async/RunnableSigsLoader.java @@ -15,6 +15,7 @@ public class RunnableSigsLoader extends ARunnable { private Map signaturesMap; // map of signatures private Map allowedSignaturesMap; // effectively the list of exact strings which are allowed when capturing private Map> excludeExtsMap; // signature -> List of file extensions to ignore + private List binaryExcludePatterns; // list of file name patterns to exclude from binary detection private String dictionaryVersion = "undefined"; private InputStream inputStream; @@ -38,6 +39,10 @@ public Map> getExcludeExtsMap() { return excludeExtsMap; } + public List getBinaryExcludePatterns() { + return binaryExcludePatterns; + } + public boolean isReady() { return isReady.get(); } @@ -56,6 +61,7 @@ public void _run() { Map allowedSignaturesTmpMap = new HashMap<>(); Map> includeExt = new HashMap<>(); Map> excludeExtTmpMap = new HashMap<>(); + List binaryExcludePatternsTmp = new ArrayList<>(); for (Object key : properties.keySet()) { String sigId = key.toString(); @@ -81,6 +87,25 @@ public void _run() { continue; } + // load binary file exclusion patterns + if (sigId.endsWith("(binary-exclude)")) { + String[] patterns = expression.split(","); + for (String patternStr : patterns) { + patternStr = patternStr.trim(); + if (!patternStr.isEmpty()) { + try { + Pattern pattern = Pattern.compile(patternStr); + binaryExcludePatternsTmp.add(pattern); + logInfo("Binary exclusion pattern loaded: '{}'", patternStr); + } catch (PatternSyntaxException pse) { + logError("Error while compiling binary exclusion pattern '{}'", patternStr, pse); + } + } + } + + continue; + } + if (sigId.endsWith("(allowed)")) { 
sigId = sigId.substring(0, sigId.length() - 9); @@ -94,9 +119,11 @@ public void _run() { signaturesMap = Collections.unmodifiableMap(regExpTmpMap); allowedSignaturesMap = Collections.unmodifiableMap(allowedSignaturesTmpMap); excludeExtsMap = Collections.unmodifiableMap(excludeExtTmpMap); + binaryExcludePatterns = Collections.unmodifiableList(binaryExcludePatternsTmp); logInfo("Signatures are loaded successfully, number of signatures is {}", signaturesMap.size()); logInfo("Number of allowed signatures is {}", allowedSignaturesMap.size()); + logInfo("Number of binary exclusion patterns is {}", binaryExcludePatterns.size()); logInfo("Dictionary version is {}", dictionaryVersion); isReady.set(true); diff --git a/common/src/main/java/com/github/exadmin/cyberferret/utils/FileUtils.java b/common/src/main/java/com/github/exadmin/cyberferret/utils/FileUtils.java index 24c0312..ac02031 100644 --- a/common/src/main/java/com/github/exadmin/cyberferret/utils/FileUtils.java +++ b/common/src/main/java/com/github/exadmin/cyberferret/utils/FileUtils.java @@ -8,6 +8,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; public class FileUtils { public static String readFile(String filePath) throws IOException { @@ -63,6 +66,51 @@ public static InputStream toFileInputStream(Path path) { } } + public static boolean isBinaryFile(Path path) throws IOException { + if (!Files.isRegularFile(path)) { + return false; + } + + byte[] buffer = new byte[16 * 1024]; + int bytesRead; + try (InputStream is = Files.newInputStream(path)) { + bytesRead = is.read(buffer); + } + if (bytesRead <= 0) { + return false; + } + + byte[] head = bytesRead == buffer.length ? buffer : Arrays.copyOf(buffer, bytesRead); + + // UTF-16 text legitimately contains NUL bytes, so classify the head the same way + // readFile does and skip the NUL check when it is recognised as UTF-16. 
+ Charset charset = decodeText(head).charset(); + if (charset == StandardCharsets.UTF_16LE || charset == StandardCharsets.UTF_16BE) { + return false; + } + + for (byte b : head) { + if (b == 0) { + return true; + } + } + return false; + } + + public static boolean matchesAnyPattern(Path path, List patterns) { + if (path == null || patterns == null || patterns.isEmpty()) { + return false; + } + + String fileName = path.getFileName().toString(); + for (Pattern pattern : patterns) { + if (pattern.matcher(fileName).matches()) { + return true; + } + } + return false; + } + private static DecodedText decodeText(byte[] bytes) { if (startsWith(bytes, (byte) 0xEF, (byte) 0xBB, (byte) 0xBF)) { return new DecodedText(StandardCharsets.UTF_8, 3); diff --git a/common/src/test/java/com/github/exadmin/cyberferret/async/RunnableScannerTests.java b/common/src/test/java/com/github/exadmin/cyberferret/async/RunnableScannerTests.java index abff0ee..d77bba0 100644 --- a/common/src/test/java/com/github/exadmin/cyberferret/async/RunnableScannerTests.java +++ b/common/src/test/java/com/github/exadmin/cyberferret/async/RunnableScannerTests.java @@ -21,6 +21,9 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class RunnableScannerTests { + // PNG magic header: a real binary signature used as a stand-in for a committed binary file + private static final byte[] PNG_HEADER = {(byte) 0x89, 'P', 'N', 'G', 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D}; + @TempDir Path tempDir; @@ -100,6 +103,88 @@ public void cliMode_findsSignatureInUtf16LeFileCreatedByWindowsPowershell() thro assertTrue(runnableScanner.isAnySignatureFound()); } + @Test + public void cliMode_flagsBinaryFileAsArtifact() throws IOException { + Path repoRoot = tempDir.resolve("repo"); + Files.createDirectories(repoRoot.resolve(".git")); + Files.writeString(repoRoot.resolve(".git/config"), "[core]", StandardCharsets.UTF_8); + + Path stagedFile = repoRoot.resolve("blob.bin"); + Files.write(stagedFile, PNG_HEADER); 
+ + FoundItemsContainer foundItemsContainer = new FoundItemsContainer(); + RunnableScanner runnableScanner = new RunnableScanner(true); + runnableScanner.setDirToScan(repoRoot.toString()); + runnableScanner.setFoundItemsContainer(foundItemsContainer); + runnableScanner.setSignaturesMap(Map.of("test", Pattern.compile("secret"))); + runnableScanner.setAllowedSignaturesMap(Map.of()); + runnableScanner.setExcludeExtMap(Map.of()); + runnableScanner.setStagedFiles(List.of(stagedFile)); + + runnableScanner.run(); + + List artifacts = foundItemsContainer.getFoundItemsCopy().stream() + .filter(item -> "BINARY_ARTIFACT".equals(item.getVisualName())) + .toList(); + assertEquals(1, artifacts.size()); + assertEquals(stagedFile.toAbsolutePath().normalize(), artifacts.getFirst().getFilePath()); + // a binary artifact must fail the scan so the CLI/pre-commit hook exits non-zero + assertTrue(runnableScanner.isAnySignatureFound()); + } + + @Test + public void cliMode_skipsBinaryFileMatchingExcludePattern() throws IOException { + Path repoRoot = tempDir.resolve("repo"); + Files.createDirectories(repoRoot.resolve(".git")); + Files.writeString(repoRoot.resolve(".git/config"), "[core]", StandardCharsets.UTF_8); + + Path stagedFile = repoRoot.resolve("archive.dat"); + Files.write(stagedFile, PNG_HEADER); + + FoundItemsContainer foundItemsContainer = new FoundItemsContainer(); + RunnableScanner runnableScanner = new RunnableScanner(true); + runnableScanner.setDirToScan(repoRoot.toString()); + runnableScanner.setFoundItemsContainer(foundItemsContainer); + runnableScanner.setSignaturesMap(Map.of("test", Pattern.compile("secret"))); + runnableScanner.setAllowedSignaturesMap(Map.of()); + runnableScanner.setExcludeExtMap(Map.of()); + runnableScanner.setBinaryExcludePatterns(List.of(Pattern.compile(".*\\.dat"))); + runnableScanner.setStagedFiles(List.of(stagedFile)); + + runnableScanner.run(); + + boolean anyArtifact = foundItemsContainer.getFoundItemsCopy().stream() + .anyMatch(item -> 
"BINARY_ARTIFACT".equals(item.getVisualName())); + assertFalse(anyArtifact); + assertFalse(runnableScanner.isAnySignatureFound()); + } + + @Test + public void cliMode_doesNotFlagSupportedImageAsBinaryArtifact() throws IOException { + Path repoRoot = tempDir.resolve("repo"); + Files.createDirectories(repoRoot.resolve(".git")); + Files.writeString(repoRoot.resolve(".git/config"), "[core]", StandardCharsets.UTF_8); + + // images are binary, but their metadata is scanned for signatures, so they must not be flagged + Path stagedFile = repoRoot.resolve("logo.png"); + Files.write(stagedFile, PNG_HEADER); + + FoundItemsContainer foundItemsContainer = new FoundItemsContainer(); + RunnableScanner runnableScanner = new RunnableScanner(true); + runnableScanner.setDirToScan(repoRoot.toString()); + runnableScanner.setFoundItemsContainer(foundItemsContainer); + runnableScanner.setSignaturesMap(Map.of("test", Pattern.compile("secret"))); + runnableScanner.setAllowedSignaturesMap(Map.of()); + runnableScanner.setExcludeExtMap(Map.of()); + runnableScanner.setStagedFiles(List.of(stagedFile)); + + runnableScanner.run(); + + boolean anyArtifact = foundItemsContainer.getFoundItemsCopy().stream() + .anyMatch(item -> "BINARY_ARTIFACT".equals(item.getVisualName())); + assertFalse(anyArtifact); + } + @Test public void cliMode_handlesAbsoluteStagedFileWhenScanDirectoryIsRelative() throws IOException { Path repoRoot = tempDir.resolve("repo"); diff --git a/common/src/test/java/com/github/exadmin/cyberferret/utils/FileUtilsTests.java b/common/src/test/java/com/github/exadmin/cyberferret/utils/FileUtilsTests.java new file mode 100644 index 0000000..ee31afc --- /dev/null +++ b/common/src/test/java/com/github/exadmin/cyberferret/utils/FileUtilsTests.java @@ -0,0 +1,105 @@ +package com.github.exadmin.cyberferret.utils; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import 
java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.regex.Pattern; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class FileUtilsTests { + // PNG magic header: a real binary signature that is not valid text in any encoding + private static final byte[] PNG_HEADER = {(byte) 0x89, 'P', 'N', 'G', 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D}; + + @TempDir + Path tempDir; + + @Test + public void isBinaryFile_detectsNulByteAsBinary() throws IOException { + Path file = tempDir.resolve("blob.bin"); + Files.write(file, PNG_HEADER); + + assertTrue(FileUtils.isBinaryFile(file)); + } + + @Test + public void isBinaryFile_treatsPlainTextAsText() throws IOException { + Path file = tempDir.resolve("notes.txt"); + Files.writeString(file, "just some text", StandardCharsets.UTF_8); + + assertFalse(FileUtils.isBinaryFile(file)); + } + + @Test + public void isBinaryFile_treatsUtf16TextAsText() throws IOException { + // UTF-16 contains NUL bytes by design, yet it is still text and must be scanned + Path file = tempDir.resolve("notes-utf16.txt"); + Files.write(file, utf16LeWithBom("hack hack\r\n")); + + assertFalse(FileUtils.isBinaryFile(file)); + } + + @Test + public void isBinaryFile_treatsEmptyFileAsText() throws IOException { + Path file = tempDir.resolve("empty.txt"); + Files.write(file, new byte[0]); + + assertFalse(FileUtils.isBinaryFile(file)); + } + + @Test + public void isBinaryFile_returnsFalseForDirectory() throws IOException { + assertFalse(FileUtils.isBinaryFile(tempDir)); + } + + @Test + public void matchesAnyPattern_matchesByFileName() { + Path file = Path.of("lib", "app.jar"); + List patterns = List.of(Pattern.compile(".*\\.png"), Pattern.compile(".*\\.jar")); + + assertTrue(FileUtils.matchesAnyPattern(file, patterns)); + } + + @Test + public void matchesAnyPattern_returnsFalseWhenNothingMatches() { + 
Path file = Path.of("lib", "app.jar"); + List patterns = List.of(Pattern.compile(".*\\.png")); + + assertFalse(FileUtils.matchesAnyPattern(file, patterns)); + } + + @Test + public void matchesAnyPattern_returnsFalseForEmptyOrNullPatterns() { + Path file = Path.of("app.jar"); + + assertFalse(FileUtils.matchesAnyPattern(file, List.of())); + assertFalse(FileUtils.matchesAnyPattern(file, null)); + } + + @Test + public void matchesAnyPattern_whitelistsWrapperJarsButNotOtherJars() { + // the default binary-exclude patterns shipped in README.md + List defaults = List.of(Pattern.compile("gradle-wrapper\\.jar"), Pattern.compile("maven-wrapper\\.jar")); + + assertTrue(FileUtils.matchesAnyPattern(Path.of("gradle", "wrapper", "gradle-wrapper.jar"), defaults)); + assertTrue(FileUtils.matchesAnyPattern(Path.of(".mvn", "wrapper", "maven-wrapper.jar"), defaults)); + assertFalse(FileUtils.matchesAnyPattern(Path.of("build", "libs", "app.jar"), defaults)); + } + + private static byte[] utf16LeWithBom(String value) { + byte[] content = value.getBytes(StandardCharsets.UTF_16LE); + ByteBuffer buffer = ByteBuffer.allocate(content.length + 2).order(ByteOrder.LITTLE_ENDIAN); + buffer.put((byte) 0xFF); + buffer.put((byte) 0xFE); + buffer.put(content); + return buffer.array(); + } +} diff --git a/fx/src/main/java/com/github/exadmin/cyberferret/fxui/SceneBuilder.java b/fx/src/main/java/com/github/exadmin/cyberferret/fxui/SceneBuilder.java index 55b50d5..c06dae1 100644 --- a/fx/src/main/java/com/github/exadmin/cyberferret/fxui/SceneBuilder.java +++ b/fx/src/main/java/com/github/exadmin/cyberferret/fxui/SceneBuilder.java @@ -174,6 +174,7 @@ protected TitledPane createOfflineDictionaryPane() { runnableScanner.setSignaturesMap(runnableSigsLoader.getSignaturesMap()); runnableScanner.setAllowedSignaturesMap(runnableSigsLoader.getAllowedSignaturesMap()); runnableScanner.setExcludeExtMap(runnableSigsLoader.getExcludeExtsMap()); + 
runnableScanner.setBinaryExcludePatterns(runnableSigsLoader.getBinaryExcludePatterns()); btnLoadSigs.setDisable(false); });