Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion chart/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ description: A Gradle Build Cache server with Redis backend for EduIDE deploymen
type: application

# Bump this version on every release — also used as the Docker image tag
version: 0.5.2
version: 0.5.3

dependencies:
- name: reposilite
Expand Down
7 changes: 7 additions & 0 deletions chart/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,11 @@ data:
logging:
level: "info"
format: "json"

static_analysis:
enabled: {{ .Values.staticAnalysis.enabled }}
check_network: {{ .Values.staticAnalysis.checkNetwork }}
check_exec: {{ .Values.staticAnalysis.checkExec }}
check_reflection: {{ .Values.staticAnalysis.checkReflection }}
check_filesystem: {{ .Values.staticAnalysis.checkFilesystem }}
{{- end }}
10 changes: 10 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ tls:
# Kubernetes secret name containing tls.crt and tls.key
secretName: ""

# ============================================================
# Static Code Analysis (Java JARs)
# ============================================================
staticAnalysis:
enabled: false
checkNetwork: false
checkExec: false
checkReflection: false
checkFilesystem: false

# ============================================================
# Gateway API (HTTPRoutes only)
# ============================================================
Expand Down
7 changes: 7 additions & 0 deletions src/configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,10 @@ metrics:
logging:
level: "info"
format: "json"

static_analysis:
enabled: false
check_network: false
check_exec: false
check_reflection: false
check_filesystem: false
Comment thread
KevinGruber2001 marked this conversation as resolved.
189 changes: 189 additions & 0 deletions src/internal/analysis/analyzer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
package analysis

import (
"archive/tar"
"archive/zip"
"bytes"
"compress/gzip"
"fmt"
"io"
"strings"

"github.com/kevingruber/gradle-cache/internal/config"
)

// Violation describes a single forbidden API reference found in a class file.
// It is the element type returned by Analyzer.Check; the struct tags give the
// field names used when a Violation is encoded as JSON.
type Violation struct {
// Class is the archive entry name with its ".class" suffix removed.
Class string `json:"class"` // e.g. "com/student/Homework"
// Matched is the constant-pool string that began with a forbidden prefix.
Matched string `json:"matched"` // e.g. "java/net/Socket"
// Category is the category label of the rule that matched.
Category string `json:"category"` // e.g. "network"
}

// Analyzer inspects Java bytecode for forbidden API usage.
// Construct one with New; an Analyzer whose rules slice is empty reports
// no violations because there is nothing to match against.
type Analyzer struct {
// rules holds the enabled rule sets, in the order New appended them.
rules []rule
}

// rule pairs a category label with the string prefixes that trigger it.
// New builds rule values positionally, so the field order here must not change.
type rule struct {
// category is copied into Violation.Category for every match.
category string
// patterns are prefixes compared against constant-pool strings.
patterns []string
}

// New builds an Analyzer from cfg. Each per-category flag (CheckNetwork,
// CheckExec, CheckReflection, CheckFilesystem) that is set contributes its
// rule, always in that fixed order. Only those four flags are consulted here;
// with none set the returned Analyzer has no rules.
func New(cfg config.StaticAnalysisConfig) *Analyzer {
	// Candidate rules listed in the order they must appear in the Analyzer.
	candidates := []struct {
		enabled bool
		r       rule
	}{
		{cfg.CheckNetwork, rule{"network", networkPatterns}},
		{cfg.CheckExec, rule{"exec", execPatterns}},
		{cfg.CheckReflection, rule{"reflection", reflectionPatterns}},
		{cfg.CheckFilesystem, rule{"filesystem", filesystemPatterns}},
	}

	analyzer := new(Analyzer)
	for _, c := range candidates {
		if c.enabled {
			analyzer.rules = append(analyzer.rules, c.r)
		}
	}
	return analyzer
}

// Check inspects data for forbidden API usage.
// Gradle cache entries are gzip-compressed tar archives — those are handled first.
// Plain ZIP/JAR files are also supported.
// Any other format is passed through silently (returns nil, nil).
//
// The format is chosen purely from the leading magic bytes (isGzip, isZIP).
// Whatever the selected scanner returns, including its error, is handed back
// to the caller unchanged.
func (a *Analyzer) Check(data []byte) ([]Violation, error) {
switch {
case isGzip(data):
// Gzip input: checkGzipTar creates its own dedup set.
return a.checkGzipTar(data)
case isZIP(data):
// Top-level ZIP/JAR: use a fresh dedup set scoped to this call.
seen := make(map[string]struct{})
return a.checkZIP(data, seen)
default:
// NOTE(review): isZIP only accepts input that starts with the local file
// header signature, so any other input lands here and is not scanned.
return nil, nil
}
Comment on lines +55 to +63

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Locate and print the analyzer + helpers around the referenced lines
file="src/internal/analysis/analyzer.go"
echo "== $file =="
nl -ba "$file" | sed -n '1,220p' | sed -n '45,110p'

# Find isZIP definition and checkZIP usage
echo "== Search for isZIP definition =="
rg -n "func\s+isZIP\s*\(" -S src/internal/analysis || true
rg -n "isZIP\(" -S src/internal/analysis || true

echo "== Search for checkZIP definition =="
rg -n "func\s+\(a\s+\*\s*Analyzer\)\s+checkZIP\s*\(" -S src/internal/analysis || true
rg -n "checkZIP\(" -S src/internal/analysis || true

# Also inspect any custom zip error handling if present
echo "== Search for zip.ErrFormat usage =="
rg -n "zip\.ErrFormat|errors\.Is\(.*ErrFormat|ErrFormat" -S src/internal/analysis || true

Repository: EduIDE/EduIDE-shared-cache

Length of output: 171


🏁 Script executed:

#!/bin/bash
set -euo pipefail

file="src/internal/analysis/analyzer.go"
echo "== $file =="
awk 'NR>=45 && NR<=110 {printf "%5d  %s\n", NR, $0}' "$file"

echo "== Search for isZIP definition =="
rg -n "func\s+isZIP\s*\(" -S src/internal/analysis || true

echo "== Search for isZIP\(\) usage =="
rg -n "isZIP\(" -S src/internal/analysis || true

echo "== Search for checkZIP definition =="
rg -n "func\s+\(a\s+\*\s*Analyzer\)\s+checkZIP\s*\(" -S src/internal/analysis || true

echo "== Search for checkZIP\(\) usage =="
rg -n "checkZIP\(" -S src/internal/analysis || true

echo "== Search for zip.ErrFormat / errors.Is handling =="
rg -n "zip\.ErrFormat|errors\.Is\(|ErrFormat" -S src/internal/analysis || true

Repository: EduIDE/EduIDE-shared-cache

Length of output: 3170


🏁 Script executed:

#!/bin/bash
set -euo pipefail
file="src/internal/analysis/analyzer.go"

echo "== isZIP (around 170-220) =="
awk 'NR>=165 && NR<=230 {printf "%5d  %s\n", NR, $0}' "$file"

echo "== checkZIP (around 100-170) =="
awk 'NR>=95 && NR<=170 {printf "%5d  %s\n", NR, $0}' "$file"

echo "== gzip/zip imports context =="
awk 'NR>=1 && NR<=45 {printf "%5d  %s\n", NR, $0}' "$file"

Repository: EduIDE/EduIDE-shared-cache

Length of output: 4913


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Search for Analyzer.Check call sites
rg -n "New\(cfg\).*Analyzer|\.Check\(" -S src | head -n 200

# Search for Analyzer usage specifically with shared-cache scanning
rg -n "analy[sz]er" -S src/internal | head -n 200

# Search for any tests mentioning ZIP/JAR signatures or isZIP
fd -t f . test tests __tests__ 2>/dev/null | head -n 50 || true
rg -n "isZIP|PK\\\\x03\\\\x04|zip.NewReader|jar" -S . | head -n 200

# Display full analyzer.go to see any additional context around comments (small file already ~ maybe > 220 lines)
wc -l src/internal/analysis/analyzer.go

Repository: EduIDE/EduIDE-shared-cache

Length of output: 3256


🌐 Web query:

Go archive/zip NewReader required signatures does it require first bytes to be PK\x03\x04 local file header

💡 Result:

No, the Go archive/zip package's NewReader function does not require the archive to begin with the PK\x03\x04 (local file header) signature [1][2]. Instead, NewReader initializes by locating and reading the Central Directory, which is typically found at or near the end of the ZIP file [1][3][4]. The ZIP specification defines the Central Directory as the authoritative source for the archive's contents, and Go's implementation follows this by: 1. Searching for the End of Central Directory (EOCD) record at the end of the provided io.ReaderAt [1][3]. 2. Using the information in the EOCD record to locate and parse the Central Directory [1][4]. 3. Populating the list of files based on the entries found in the Central Directory, rather than scanning from the beginning of the file [1][5][3]. Because the Central Directory contains the offsets to the local file headers for each individual file, NewReader does not need to start at the beginning of the stream [1][6]. This architecture allows the package to support reading ZIP files that may be appended to other data or exist as part of a larger stream where the local file headers are not positioned at the start [6].

Citations:


🌐 Web query:

ZIP file signatures empty zip starts with PK\x05\x06 end of central directory header

💡 Result:

In the ZIP file format, the signature PK\x05\x06 (hexadecimal 50 4B 05 06) identifies the End of Central Directory (EOCD) record [1][2][3]. According to the official PKWARE ZIP File Format Specification, every ZIP file must contain exactly one EOCD record [4][5]. An empty ZIP archive is defined as a file containing only this EOCD record [4][5]. Because the EOCD record is mandatory and serves as the anchor for the central directory, a ZIP file consisting solely of this structure is considered a valid, albeit empty, archive [4]. The EOCD record follows a specific structure [2][3]: - Signature: 4 bytes (PK\x05\x06) [2][3] - Disk numbers and entry counts: These fields (e.g., total number of entries in the central directory) are set to zero in an empty ZIP file [2][6]. - Total size and offset: The size of the central directory and its offset are also set to zero [2][3]. - Comment: An optional field for the ZIP file comment follows the fixed fields [2][3]. A common hexdump for a minimal empty ZIP file is: 50 4B 05 06 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [6]. This represents the 22-byte structure (4-byte signature + 18 bytes of fixed data) [2][7][6].

Citations:


🌐 Web query:

ZIP local file header signature 0x04034b50 end of central directory 0x06054b50 central directory header 0x02014b50

💡 Result:

The ZIP file format utilizes specific 4-byte magic number signatures to identify its various structural components. These signatures are stored in little-endian byte order, which is why they appear as 0x50 0x4b 0x03 0x04 in a hex editor, corresponding to the integer value 0x04034b50 [1][2]. The primary signatures are: Local File Header: 0x04034b50 This signature marks the beginning of a local file header, which precedes the actual compressed or stored data for each individual file within the archive [3][4][5]. It contains metadata such as the file name, compression method, and CRC-32 checksum [3][1][4]. Central Directory Header: 0x02014b50 This signature marks the beginning of a central directory header [6][7]. The central directory is located at the end of the ZIP archive and serves as a master index of all files contained within the ZIP, allowing for faster directory listing without scanning the entire file [7]. End of Central Directory Record (EOCD): 0x06054b50 This signature marks the beginning of the end of central directory record [1][2]. This record is found at the very end of the ZIP archive [1][7]. It provides critical information for the ZIP reader, such as the number of central directory records and the relative offset where the central directory starts, effectively anchoring the archive structure [1][7]. These signatures are formal parts of the PKWARE.ZIP File Format Specification (APPNOTE), which defines the architecture of ZIP archives [4][8][5].

Citations:


🌐 Web query:

archive/zip NewReader invalid zip returns zip.ErrFormat

💡 Result:

In Go's archive/zip package, zip.NewReader will return the error zip.ErrFormat when it cannot identify the provided input as a valid ZIP archive [1][2]. This error message is defined as "zip: not a valid zip file" [1][2]. The library returns this error in several specific scenarios, primarily when it fails to locate or parse required ZIP structure signatures [3]: - Signature Mismatches: The parser does not find the expected signatures for file headers, directory headers, or the end of the central directory record [3][4]. - Truncated or Malformed Data: The input may be missing critical components, or the data might not align with the offsets specified in the archive's central directory [3]. - Incorrect Input Size: A common cause for this error is providing an incorrect size to zip.NewReader [5]. The function requires the size of the ZIP file in bytes; if the provided size is inaccurate, the reader may fail to locate the end-of-central-directory record, leading to zip.ErrFormat [5]. While zip.ErrFormat is the standard indicator of an invalid or unrecognized ZIP format, it is not always easy to determine exactly which part of the archive structure caused the failure without inspecting the Go source code or using debug logging [3]. If you encounter this error, verify that the input is a complete, valid ZIP file and that the size passed to NewReader accurately matches the total byte length of the reader's underlying data source [5].

Citations:


🌐 Web query:

archive/zip ErrFormat error value 'zip: not a valid zip file'

💡 Result:

In the Go standard library, ErrFormat is an error variable defined in the archive/zip package with the message "zip: not a valid zip file" [1][2]. It is returned by various methods, such as zip.OpenReader or while iterating through zip files, when the input data does not conform to the expected ZIP file format [3][4][5]. This error typically occurs because the reader failed to find or validate essential ZIP file structures, such as [4]: * Missing or invalid ZIP signature headers (file, directory, or end-of-central-directory signatures) [6][4][5]. * Data that is truncated or shorter than the metadata claims [4][5]. * An invalid directory offset (e.g., negative or exceeding the file size) [4]. * The provided file is not actually a ZIP archive, or contains unexpected "garbage" data at the beginning or within the file structure [4]. Because ErrFormat is a sentinel error, it provides a generic message to prevent the exposure of implementation details or excessive information during parsing, which has been the subject of community discussion regarding the difficulty of debugging specific causes of format failures [3][4]. To diagnose why a specific file is triggering this error, developers often use external tools like the system's unzip command to check if the file can be validated or to see more descriptive error messages [3]. Adding local debugging logs to the internal archive/zip package source code is another common approach for identifying which specific header or offset verification is failing [4].

Citations:


🏁 Script executed:

#!/bin/bash
set -euo pipefail
file="src/internal/handler/cache_put.go"
echo "== $file =="
awk 'NR>=1 && NR<=140 {printf "%5d  %s\n", NR, $0}' "$file" | sed -n '40,120p'

file2="src/internal/handler/handler.go"
echo "== $file2 =="
awk 'NR>=1 && NR<=120 {printf "%5d  %s\n", NR, $0}' "$file2" | sed -n '1,80p'

file3="src/internal/server/server.go"
echo "== $file3 =="
awk 'NR>=60 && NR<=130 {printf "%5d  %s\n", NR, $0}' "$file3"

Repository: EduIDE/EduIDE-shared-cache

Length of output: 6002


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Find Go tests mentioning Analyzer/isZIP/checkZIP
rg -n --hidden --glob '*_test.go' "Analyzer|isZIP|checkZIP|archive/zip|PK\x03\x04" . || true

# List any test files under internal/analysis
fd -t f --hidden --follow "*analysis*" . | head -n 200
fd -t f --hidden --follow "*_test.go" src/internal/analysis || true

# Show full analyzer.go line count already known, but ensure no other ZIP detection elsewhere
rg -n "func\s+isZIP|PK\\x03\\x04|isZIP\(" src/internal/analysis/analyzer.go

Repository: EduIDE/EduIDE-shared-cache

Length of output: 360


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Search for tests or fixtures mentioning analyzer/zip/jar/isZIP
rg -n --hidden --glob '*_test.go' "Analyzer|isZIP|checkZIP|archive/zip|PK\x03\x04|jar" . || true

echo "== Files in src/internal/analysis matching *_test.go =="
fd -t f --hidden -g "*_test.go" src/internal/analysis || true

echo "== Any non-test files referencing isZIP =="
rg -n --hidden "isZIP\(" src/internal/analysis || true

Repository: EduIDE/EduIDE-shared-cache

Length of output: 294


ZIP detection is too strict and can skip parseable ZIP/JAR inputs.

Line 58 gates ZIP scanning on isZIP, which only returns true for buffers starting with the local file header signature PK\x03\x04. For ZIP/JAR data that is still valid but doesn’t start with that signature, Analyzer.Check falls through to default and returns (nil, nil), causing silent skipping (no parse attempt). This is also inconsistent with the gzip-tar path, where .jar entries call checkZIP without an isZIP gate.

Suggested robustness diff
 import (
 	"archive/tar"
 	"archive/zip"
 	"bytes"
 	"compress/gzip"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
@@
 func (a *Analyzer) Check(data []byte) ([]Violation, error) {
-	switch {
-	case isGzip(data):
+	if isGzip(data) {
 		return a.checkGzipTar(data)
-	case isZIP(data):
-		seen := make(map[string]struct{})
-		return a.checkZIP(data, seen)
-	default:
-		return nil, nil
 	}
+
+	seen := make(map[string]struct{})
+	v, err := a.checkZIP(data, seen)
+	if err != nil {
+		if errors.Is(err, zip.ErrFormat) {
+			return nil, nil // unknown/non-zip format
+		}
+		return nil, err
+	}
+	return v, nil
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
switch {
case isGzip(data):
return a.checkGzipTar(data)
case isZIP(data):
seen := make(map[string]struct{})
return a.checkZIP(data, seen)
default:
return nil, nil
}
if isGzip(data) {
return a.checkGzipTar(data)
}
seen := make(map[string]struct{})
v, err := a.checkZIP(data, seen)
if err != nil {
if errors.Is(err, zip.ErrFormat) {
return nil, nil // unknown/non-zip format
}
return nil, err
}
return v, nil
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/internal/analysis/analyzer.go` around lines 55 - 63, The ZIP detection is
too strict: in Analyzer.Check the switch currently only calls checkZIP when
isZIP(data) is true, causing valid ZIP/JAR payloads that lack the local file
header at the start to be skipped; instead, mirror the gzip path by removing the
isZIP gate and always attempt ZIP parsing for non-gzip data—create the same seen
map (seen := make(map[string]struct{})) and call a.checkZIP(data, seen) in the
non-gzip branch (keep the isGzip -> a.checkGzipTar(data) branch and the default
error handling), so checkZIP is attempted for any non-gzip input just like .jar
entries handled inside checkGzipTar.

}

// checkGzipTar decompresses a gzip stream and scans the tar entries inside.
// .class files are scanned directly; .jar files inside the tar are opened as
// ZIPs and their .class files scanned one level deep.
//
// It returns an error when the gzip header cannot be read or when advancing
// to the next tar entry fails; a failure while reading one entry's contents
// only skips that entry.
func (a *Analyzer) checkGzipTar(data []byte) ([]Violation, error) {
gr, err := gzip.NewReader(bytes.NewReader(data))
if err != nil {
return nil, fmt.Errorf("failed to open gzip stream: %w", err)
}
defer gr.Close()

tr := tar.NewReader(gr)
// seen is shared by every entry of this archive so duplicates are reported once.
seen := make(map[string]struct{})
var violations []Violation

for {
hdr, err := tr.Next()
if err == io.EOF {
// Normal end of the tar stream.
break
}
if err != nil {
return nil, fmt.Errorf("failed to read tar entry: %w", err)
}

// NOTE(review): every entry is read fully into memory before its name is
// examined, and the read has no size cap — confirm this is acceptable for
// untrusted, highly compressible input.
entryData, err := io.ReadAll(tr)
if err != nil {
// Best effort: an unreadable entry is skipped rather than failing the scan.
continue
}
Comment on lines +89 to +92

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Files ==="
ls -la src/internal/analysis/analyzer.go

echo "=== Locate io.ReadAll usage in analyzer.go ==="
rg -n "io\.ReadAll\(" src/internal/analysis/analyzer.go

echo "=== Show analyzer.go around reported lines (70-160) ==="
nl -ba src/internal/analysis/analyzer.go | sed -n '40,190p'

echo "=== Show analyzer.go around checkZIP / checkGzipTar definitions (if present) ==="
rg -n "func \(a \*Analyzer\) checkGzipTar|func \(a \*Analyzer\) checkZIP|func isGzip|func isZIP" src/internal/analysis/analyzer.go
nl -ba src/internal/analysis/analyzer.go | sed -n '1,260p'

echo "=== Check handler call sites for Analyzer.Check and whether they validate/limit upload sizes ==="
rg -n "Analyzer\.Check\(" -S src/internal | head
rg -n "cache_put\.go|cache_put\.go" -S src/internal/handler | head -n 50
nl -ba src/internal/handler/cache_put.go | sed -n '1,220p' || true

Repository: EduIDE/EduIDE-shared-cache

Length of output: 409


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Files ==="
ls -la src/internal/analysis/analyzer.go

echo "=== Locate io.ReadAll usage in analyzer.go ==="
rg -n "io\.ReadAll\(" src/internal/analysis/analyzer.go

echo "=== Show analyzer.go around reported lines (70-160) ==="
nl -ba src/internal/analysis/analyzer.go | sed -n '70,160p'

echo "=== Show analyzer.go around checkZIP / checkGzipTar definitions ==="
rg -n "checkGzipTar|checkZIP|scanClassBytes|isGzip|isZIP" src/internal/analysis/analyzer.go
nl -ba src/internal/analysis/analyzer.go | sed -n '1,240p'

echo "=== Check handler call sites for Analyzer.Check and whether they validate/limit upload sizes ==="
rg -n "Analyzer\.Check\(" -S src/internal | head -n 50
for f in $(fd -e go src/internal/handler | tr '\n' ' '); do :; done
if [ -f src/internal/handler/cache_put.go ]; then
  nl -ba src/internal/handler/cache_put.go | sed -n '1,260p'
fi

# Search for other upload size limits
rg -n "max.*(upload|request|body|bytes|size)" src/internal | head -n 50
rg -n "http\.MaxBytesReader|MaxBytesReader|io\.LimitReader|LimitedReader" src/internal | head -n 50

Repository: EduIDE/EduIDE-shared-cache

Length of output: 409


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Locate io.ReadAll usage in analyzer.go ==="
rg -n "io\.ReadAll\(" src/internal/analysis/analyzer.go

echo "=== Show analyzer.go around lines 60-160 ==="
nl -ba src/internal/analysis/analyzer.go | sed -n '55,160p'

echo "=== Show analyzer.go around lines 160-240 (to include ZIP/tar helpers if any) ==="
nl -ba src/internal/analysis/analyzer.go | sed -n '160,280p'

echo "=== Show relevant helper funcs (checkGzipTar, checkZIP, isGzip, isZIP) ==="
for pat in "checkGzipTar" "checkZIP" "isGzip" "isZIP"; do
  echo "--- matching $pat ---"
  rg -n "$pat" src/internal/analysis/analyzer.go
done

echo "=== Show handler cache_put.go that calls analyzer.Check ==="
if [ -f src/internal/handler/cache_put.go ]; then
  rg -n "Analyzer\.Check" src/internal/handler/cache_put.go
  nl -ba src/internal/handler/cache_put.go | sed -n '1,260p'
else
  echo "cache_put.go not found"
fi

echo "=== Search for size limits on uploads/bodies ==="
rg -n "MaxBytesReader|http\.MaxBytesReader|max.*upload|max.*request|max.*body|io\.LimitReader|LimitedReader" src/internal | head -n 100

Repository: EduIDE/EduIDE-shared-cache

Length of output: 300


🏁 Script executed:

[Cannot proceed: no verification evidence (tool outputs) was provided in this chat. I need the repository inspection results to rewrite the comment accurately.]

Repository: EduIDE/EduIDE-shared-cache

Length of output: 318


🏁 Script executed:

#!/bin/bash
set -euo pipefail

FILE="src/internal/analysis/analyzer.go"

echo "=== Line count ==="
wc -l "$FILE"

echo "=== analyzer.go: lines 1-120 ==="
sed -n '1,120p' "$FILE"

echo "=== analyzer.go: lines 120-200 ==="
sed -n '120,200p' "$FILE"

echo "=== analyzer.go: where helpers gate archive type ==="
rg -n "isGzip|isZIP|checkGzipTar|checkZIP|scanClassBytes" "$FILE"

echo "=== analyzer.go: any existing bounds/limits ==="
rg -n "LimitReader|MaxBytes|max.*(bytes|size)|io\.CopyN|ReadFull" "$FILE" || true

echo "=== Find handler(s) that call Analyzer.Check ==="
fd -e go "cache_put.go" src/internal/handler -t f || true
rg -n "Analyzer\.Check" src/internal/handler -S || true

echo "=== Show cache_put.go if present ==="
if [ -f "src/internal/handler/cache_put.go" ]; then
  sed -n '1,240p' "src/internal/handler/cache_put.go"
fi

echo "=== Search for request/body size limits in internal HTTP handlers/middleware ==="
rg -n "MaxBytesReader|http\.MaxBytesReader|max.*(upload|request|body)|LimitReader|LimitedReader" src/internal -S || true

Repository: EduIDE/EduIDE-shared-cache

Length of output: 9067


🏁 Script executed:

#!/bin/bash
set -euo pipefail
rg -n "maxEntrySize" -S src/internal | head -n 50
rg -n "maxEntrySize" -S . | head -n 50
fd -e go src/internal | wc -l

Repository: EduIDE/EduIDE-shared-cache

Length of output: 3677


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Search for maxEntrySize definition ==="
rg -n "maxEntrySize" -S src/internal

echo "=== Search for CacheHandler struct (maxEntrySize field) ==="
rg -n "type CacheHandler" -S src/internal/handler
rg -n "maxEntrySize" -S src/internal/handler

echo "=== Search for defaults / config wiring ==="
rg -n "StaticAnalysisConfig|CheckNetwork|CheckExec|CheckReflection|CheckFilesystem" -S src/internal | head -n 50
rg -n "max.*entry.*size|max.*Entry.*Size|max.*upload.*size" -S src/internal src | head -n 50

Repository: EduIDE/EduIDE-shared-cache

Length of output: 8019


Bound decompressed reads from untrusted archives.

src/internal/analysis/analyzer.go uses unbounded io.ReadAll on decompressed archive contents (io.ReadAll(tr) at ~89 for tar entries and io.ReadAll(rc) at ~132 for ZIP entries). While the HTTP handler caps the uploaded artifact size (maxEntrySize via io.ReadAll(io.LimitReader(...))), gzip/zip bombs can still expand to extremely large per-entry sizes and exhaust memory during analysis.

Additionally, isZIP only returns true for PK\x03\x04 at byte 0; otherwise Analyzer.Check falls back to (nil, nil), skipping static analysis for ZIP/JAR files that don’t match this narrow signature gate.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/internal/analysis/analyzer.go` around lines 89 - 92, Analyzer.Check
currently uses unbounded io.ReadAll on tar/zip entry readers (entryData :=
io.ReadAll(tr) and io.ReadAll(rc)), and isZIP only accepts the exact 4‑byte
sequence PK\x03\x04 which misses other valid ZIP/JAR signatures. Replace the
unbounded reads with bounded reads using io.ReadAll(io.LimitReader(tr,
maxEntrySize)) and io.ReadAll(io.LimitReader(rc, maxEntrySize)), and treat reads
that reach the limit as "entry too large" (skip or return a handled error) to
avoid memory blowups; reference the variables entryData, tr, rc and the existing
maxEntrySize constant. Also broaden isZIP to detect ZIP/JAR by checking for the
"PK" prefix (e.g., bytes.HasPrefix(buf, []byte("PK"))) rather than only
PK\x03\x04 so ZIP/JAR archives aren't incorrectly skipped; reference the isZIP
function and Analyzer.Check where it's used.


// Dispatch on the entry name's suffix; entries that are neither .class
// nor .jar are ignored.
switch {
case strings.HasSuffix(hdr.Name, ".class"):
// Direct class file — common for compileJava task output.
className := strings.TrimSuffix(hdr.Name, ".class")
v := a.scanClassBytes(className, entryData, seen)
violations = append(violations, v...)

case strings.HasSuffix(hdr.Name, ".jar"):
// JAR inside the tar — common for the jar task output.
v, err := a.checkZIP(entryData, seen)
if err != nil {
// Best effort: a nested JAR that cannot be opened as a ZIP is skipped,
// so its error never reaches the caller.
continue
}
violations = append(violations, v...)
}
}

return violations, nil
}

// maxClassFileSize caps how many decompressed bytes are read from a single
// .class entry. Archive contents are untrusted and a tiny compressed entry can
// inflate to an arbitrary size, so reads must be bounded. The value is
// deliberately generous — NOTE(review): confirm the limit suits the deployment.
const maxClassFileSize = 32 << 20

// checkZIP opens data as a ZIP archive and scans every .class file inside.
//
// seen is the dedup set shared across one Check invocation; it is passed
// through to scanClassBytes and mutated there.
//
// An error is returned when data cannot be opened as a ZIP, or when a .class
// entry is larger than maxClassFileSize (failing closed instead of silently
// leaving an oversized class unscanned). Entries that merely fail to open or
// decompress are skipped, as before.
func (a *Analyzer) checkZIP(data []byte, seen map[string]struct{}) ([]Violation, error) {
	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return nil, fmt.Errorf("failed to open ZIP: %w", err)
	}

	var violations []Violation

	for _, f := range zr.File {
		if !strings.HasSuffix(f.Name, ".class") {
			continue
		}

		// Cheap rejection based on the declared size. The header is attacker
		// controlled, so the bounded read below is still required.
		if f.UncompressedSize64 > maxClassFileSize {
			return nil, fmt.Errorf("class entry %q exceeds %d bytes", f.Name, maxClassFileSize)
		}

		classData, oversized, err := readClassEntry(f)
		if oversized {
			return nil, fmt.Errorf("class entry %q exceeds %d bytes", f.Name, maxClassFileSize)
		}
		if err != nil {
			// Best effort: skip entries that cannot be opened or decompressed.
			continue
		}

		className := strings.TrimSuffix(f.Name, ".class")
		violations = append(violations, a.scanClassBytes(className, classData, seen)...)
	}

	return violations, nil
}

// readClassEntry decompresses one ZIP entry, reading at most
// maxClassFileSize+1 bytes. oversized is true when the entry holds more than
// maxClassFileSize bytes; in that case no data is returned.
func readClassEntry(f *zip.File) (data []byte, oversized bool, err error) {
	rc, err := f.Open()
	if err != nil {
		return nil, false, err
	}
	defer rc.Close()

	// Read one byte past the limit so "exactly at the limit" and "over the
	// limit" can be told apart.
	data, err = io.ReadAll(io.LimitReader(rc, maxClassFileSize+1))
	if err != nil {
		return nil, false, err
	}
	if len(data) > maxClassFileSize {
		return nil, true, nil
	}
	return data, false, nil
}

// scanClassBytes extracts the constant-pool strings of one .class file and
// reports each string that starts with a pattern of an active rule.
//
// seen is shared within one Check invocation and is updated here; a given
// (class, string, category) combination is reported at most once. Data that
// cannot be parsed as a class file yields no violations.
func (a *Analyzer) scanClassBytes(className string, data []byte, seen map[string]struct{}) []Violation {
	pool, err := extractConstantPoolStrings(data)
	if err != nil {
		return nil
	}

	var found []Violation
	for _, entry := range pool {
		for _, rl := range a.rules {
			// The dedup key does not include the pattern, so the first
			// matching prefix within a rule decides the outcome.
			matched := false
			for _, prefix := range rl.patterns {
				if strings.HasPrefix(entry, prefix) {
					matched = true
					break
				}
			}
			if !matched {
				continue
			}

			key := strings.Join([]string{className, entry, rl.category}, "|")
			if _, already := seen[key]; already {
				continue
			}
			seen[key] = struct{}{}
			found = append(found, Violation{Class: className, Matched: entry, Category: rl.category})
		}
	}
	return found
}

// isGzip reports whether data starts with the two-byte gzip magic 0x1f 0x8b.
func isGzip(data []byte) bool {
	return bytes.HasPrefix(data, []byte{0x1f, 0x8b})
}

// isZIP reports whether data starts with the ZIP local file header
// signature "PK\x03\x04". Only the first four bytes are examined.
func isZIP(data []byte) bool {
	return bytes.HasPrefix(data, []byte("PK\x03\x04"))
}
77 changes: 77 additions & 0 deletions src/internal/analysis/classfile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package analysis

import (
"encoding/binary"
"fmt"
)

// extractConstantPoolStrings walks the constant pool of a Java class file and
// collects the payload of every CONSTANT_Utf8 entry, in pool order. Class
// names, member names and type descriptors are stored there verbatim, which
// lets callers look for forbidden API references without running any code.
//
// An error is returned for input shorter than 10 bytes, a wrong magic number,
// an unknown tag, a truncated Utf8 entry, or a pool that ends before the next
// entry's tag byte. Fixed-width entries are skipped by advancing the cursor
// without inspecting their contents.
func extractConstantPoolStrings(data []byte) ([]string, error) {
	const (
		classMagic = 0xCAFEBABE
		headerLen  = 8 // magic (4) + minor_version (2) + major_version (2)
	)

	if n := len(data); n < 10 {
		return nil, fmt.Errorf("class file too short (%d bytes)", n)
	}
	if binary.BigEndian.Uint32(data) != classMagic {
		return nil, fmt.Errorf("missing class file magic bytes")
	}

	offset := headerLen
	if len(data) < offset+2 {
		return nil, fmt.Errorf("truncated before constant_pool_count")
	}
	entries := int(binary.BigEndian.Uint16(data[offset:]))
	offset += 2

	var strs []string

	// Valid pool indices are 1 through entries-1.
	for idx := 1; idx < entries; idx++ {
		if offset >= len(data) {
			return nil, fmt.Errorf("unexpected end of constant pool at entry %d", idx)
		}
		tag := data[offset]
		offset++

		// CONSTANT_Utf8 is the only variable-length entry: u2 length + bytes.
		if tag == 1 {
			if offset+2 > len(data) {
				return nil, fmt.Errorf("truncated Utf8 length at entry %d", idx)
			}
			size := int(binary.BigEndian.Uint16(data[offset : offset+2]))
			offset += 2
			end := offset + size
			if end > len(data) {
				return nil, fmt.Errorf("truncated Utf8 data at entry %d", idx)
			}
			strs = append(strs, string(data[offset:end]))
			offset = end
			continue
		}

		// Every other known tag has a fixed payload width.
		switch tag {
		case 7, 8, 16, 19, 20: // Class, String, MethodType, Module, Package
			offset += 2
		case 15: // MethodHandle
			offset += 3
		case 3, 4, // Integer, Float
			9, 10, 11, 12, // Fieldref, Methodref, InterfaceMethodref, NameAndType
			17, 18: // Dynamic, InvokeDynamic
			offset += 4
		case 5, 6: // Long, Double occupy two pool slots
			offset += 8
			idx++
		default:
			return nil, fmt.Errorf("unknown constant pool tag %d at entry %d", tag, idx)
		}
	}

	return strs, nil
}
48 changes: 48 additions & 0 deletions src/internal/analysis/patterns.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package analysis

// Forbidden API prefix lists, grouped by category.
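// HasPrefix matching is used so e.g. "java/net/Socket" also covers
// SocketInputStream, SocketOutputStream, etc. A trailing slash like "sun/net/"
// catches the entire package subtree. Note that a prefix without a trailing
// slash also matches any unrelated name that merely begins the same way.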

// networkPatterns lists the string prefixes reported under the "network"
// category when the analyzer is built with CheckNetwork set.
// Because matching is by prefix, "java/net/URL" already covers
// "java/net/URLConnection"; the separate entry is redundant but harmless.
var networkPatterns = []string{
"java/net/Socket",
"java/net/ServerSocket",
"java/net/DatagramSocket",
"java/net/MulticastSocket",
"java/net/URL",
"java/net/HttpURLConnection",
"java/net/InetAddress",
"java/net/InetSocketAddress",
"java/net/URLConnection",
"java/nio/channels/SocketChannel",
"java/nio/channels/ServerSocketChannel",
"java/nio/channels/DatagramChannel",
"sun/net/",
}

// execPatterns lists the string prefixes reported under the "exec" category
// when the analyzer is built with CheckExec set.
// NOTE(review): matching is by prefix, so "java/lang/Runtime" also matches
// "java/lang/RuntimeException" — confirm this over-match is intended.
// "java/lang/Process" already covers "java/lang/ProcessBuilder".
var execPatterns = []string{
"java/lang/Runtime",
"java/lang/ProcessBuilder",
"java/lang/Process",
}

// reflectionPatterns lists the string prefixes reported under the
// "reflection" category when the analyzer is built with CheckReflection set.
// Matching is by prefix, so each entry also covers any longer name that
// starts with it (e.g. "java/lang/invoke/MethodHandle" matches
// "java/lang/invoke/MethodHandles").
var reflectionPatterns = []string{
"java/lang/reflect/Method",
"java/lang/reflect/Field",
"java/lang/reflect/Constructor",
"java/lang/ClassLoader",
"java/lang/invoke/MethodHandle",
"sun/misc/Unsafe",
}

// filesystemPatterns lists the string prefixes reported under the
// "filesystem" category when the analyzer is built with CheckFilesystem set.
// Matching is by prefix, so "java/nio/file/Path" also matches
// "java/nio/file/Paths" and any other name beginning with it.
var filesystemPatterns = []string{
"java/io/FileOutputStream",
"java/io/FileInputStream",
"java/io/FileWriter",
"java/io/FileReader",
"java/io/RandomAccessFile",
"java/nio/file/Files",
"java/nio/file/Path",
"java/nio/channels/FileChannel",
}
Loading
Loading