diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..1ef4d1d
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,130 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL Advanced"
+
+on:
+  push:
+    branches: [ "development" ]
+  pull_request:
+    branches: [ "development" ]
+  schedule:
+    - cron: '23 23 * * 4'
+
+jobs:
+  analyze:
+    name: Analyze (${{ matrix.language }})
+    # Runner size impacts CodeQL analysis time. To learn more, please see:
+    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
+    #   - https://gh.io/supported-runners-and-hardware-resources
+    #   - https://gh.io/using-larger-runners (GitHub.com only)
+    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
+    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+    permissions:
+      # required for all workflows
+      security-events: write
+
+      # required to fetch internal or private CodeQL packs
+      packages: read
+
+      # only required for workflows in private repositories
+      actions: read
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        - language: actions
+          build-mode: none
+        - language: python
+          build-mode: none
+        # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift'
+        # Use `c-cpp` to analyze code written in C, C++ or both
+        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
+        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
+        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
+        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
+        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    # Add any setup steps before running the `github/codeql-action/init` action.
+    # This includes steps like installing compilers or runtimes (`actions/setup-node`
+    # or others). This is typically only required for manual builds.
+    # - name: Setup runtime (example)
+    #   uses: actions/setup-example@v1
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v4
+      with:
+        languages: ${{ matrix.language }}
+        build-mode: ${{ matrix.build-mode }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+        # queries: security-extended,security-and-quality
+
+    # If the analyze step fails for one of the languages you are analyzing with
+    # "We were unable to automatically build your code", modify the matrix above
+    # to set the build mode to "manual" for that language. Then modify this step
+    # to build your code.
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+    - name: Run manual build steps
+      if: matrix.build-mode == 'manual'
+      shell: bash
+      run: |
+        echo 'If you are using a "manual" build mode for one or more of the' \
+          'languages you are analyzing, replace this with the commands to build' \
+          'your code, for example:'
+        echo '  make bootstrap'
+        echo '  make release'
+        exit 1
+
+    - name: Perform CodeQL Analysis
+      id: analyze
+      uses: github/codeql-action/analyze@v4
+      with:
+        category: "/language:${{matrix.language}}"
+
+    # Quality gate: fail the job when any SARIF report produced above contains at
+    # least one result. jq is preinstalled on GitHub-hosted runner images, so no
+    # separate install step is needed.
+    - name: Fail if CodeQL found issues
+      shell: bash
+      env:
+        # Directory the analyze step wrote its SARIF files to; falls back to the
+        # action's default location (../results relative to the workspace).
+        SARIF_DIR: ${{ steps.analyze.outputs.sarif-output || format('{0}/../results', github.workspace) }}
+      run: |
+        shopt -s nullglob
+        sarif_files=("$SARIF_DIR"/*.sarif)
+        if [ "${#sarif_files[@]}" -eq 0 ]; then
+          echo "SARIF file not found. Analysis may have failed."
+          echo "Searched: $SARIF_DIR"
+          exit 1
+        fi
+        # Slurp every report and count results across all runs so the comparison
+        # below always sees a single integer, even for multi-run SARIF files.
+        count=$(jq -s '[.[].runs[]?.results[]?] | length' "${sarif_files[@]}")
+        if [ "$count" -gt 0 ]; then
+          echo "CodeQL found $count issue(s)!"
+          jq -r '.runs[]?.results[]? | "Rule: \(.ruleId)\nSeverity: \(.level)\nMessage: \(.message.text)\nFile: \(.locations[0].physicalLocation.artifactLocation.uri)\nLine: \(.locations[0].physicalLocation.region.startLine)\n---"' "${sarif_files[@]}"
+          exit 1
+        else
+          echo "No CodeQL issues found"
+        fi
diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
new file mode 100644
index 0000000..f87565d
--- /dev/null
+++ b/.github/workflows/publish-docs.yml
@@ -0,0 +1,54 @@
+name: Publish Docs
+on:
+  push:
+    branches: [ "development" ]
+  release:
+    types: [published]
+
+# Serialize deployments so concurrent runs do not race when pushing the built docs.
+concurrency:
+  group: publish-docs
+  cancel-in-progress: false
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up uv (Python 3.12)
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.12"
+          enable-cache: true
+
+      - name: Install dependencies
+        run: uv pip install -e ".[dev]"
+
+      - name: Configure Git user
+        env:
+          # Pass the actor through the environment instead of expanding it inside the script.
+          GIT_ACTOR: ${{ github.actor }}
+        run: |
+          git config --global user.name "${GIT_ACTOR}"
+          git config --global user.email "${GIT_ACTOR}@users.noreply.github.com"
+
+      - name: Deploy docs
+        run: |
+          # Release: the ref is the tag, so strip a leading 'v' and move `latest` to it.
+          # Branch push: the ref is the branch name; publish it under that name only so
+          # unreleased docs never take over the `latest` alias.
+          VERSION=${GITHUB_REF_NAME#v}
+          echo "Deploying version $VERSION"
+          if [ "$GITHUB_EVENT_NAME" = "release" ]; then
+            uv run mike deploy --push --update-aliases "$VERSION" latest
+          else
+            uv run mike deploy --push "$VERSION"
+          fi
+          # Optional: make the released version the default shown when visiting the site
+          # it's enough to run this once
+          # uv run mike set-default --push latest
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index e86f96b..e9727e9 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -5,7 +5,7 @@ name: Python application on: push: - branches: [ "main"] + branches: [ "main",
"development"] pull_request: permissions: @@ -30,7 +30,7 @@ jobs: run: uv run --dev prek run -a - name: Test with pytest - run: uv run --dev pytest + run: uv run --dev pytest -s # integration tests for DetectMateService - name: Checkout DetectMateService @@ -49,6 +49,6 @@ jobs: working-directory: DetectMateService run: uv pip install . - - name: Run DetectMateService library integration tests - working-directory: DetectMateService - run: pytest tests/library_integration + #- name: Run DetectMateService library integration tests + # working-directory: DetectMateService + # run: uv run --dev pytest tests/library_integration -s diff --git a/.gitignore b/.gitignore index 87ce953..3113ce1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ __pycache__/ # C extensions *.so +dummy* + # Distribution / packaging .Python build/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..95c372c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,102 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +DetectMateLibrary is a Python library for log processing and anomaly detection. It provides composable, stream-friendly components (parsers and detectors) that communicate via Protobuf-based schemas. The library is designed for both single-process and microservice deployments. + +## Development Commands + +```bash +# Install dependencies and pre-commit hooks +uv sync --dev +uv run prek install + +# Run tests +uv run pytest -q +uv run pytest -s # verbose with stdout +uv run pytest --cov=. 
--cov-report=term-missing # with coverage +uv run pytest tests/test_foo.py # single test file + +# Run linting/formatting (all pre-commit hooks) +uv run prek run -a + +# Recompile Protobuf (only if schemas.proto is modified) +protoc --proto_path=src/detectmatelibrary/schemas/ \ + --python_out=src/detectmatelibrary/schemas/ \ + src/detectmatelibrary/schemas/schemas.proto + +# Scaffold a new component workspace +mate create --type <parser|detector> --name <name> --dir <target_dir> +``` + +## Architecture + +### Data Flow + +``` +Raw Logs → Parser → ParserSchema → Detector → DetectorSchema (Alerts) +``` + +All data flows through typed Protobuf-backed schema objects. Components are stateful and support an optional training phase before detection. + +### Core Abstractions (`src/detectmatelibrary/common/`) + +- **`CoreComponent`** — base class managing buffering, ID generation, and training state + - **`CoreParser(CoreComponent)`** — parse raw logs into `ParserSchema` + - **`CoreDetector(CoreComponent)`** — detect anomalies in `ParserSchema`, emit `DetectorSchema` +- **`CoreConfig`** / **`CoreParserConfig`** / **`CoreDetectorConfig`** — Pydantic-based configuration hierarchy + +### Schema System (`src/detectmatelibrary/schemas/`) + +- `BaseSchema` wraps generated Protobuf messages with dict-like access (`schema["field"]`) +- Key schemas: `LogSchema`, `ParserSchema`, `DetectorSchema` +- Support serialization to/from bytes for transport and persistence + +### Buffering Modes (`src/detectmatelibrary/utils/data_buffer.py`) + +Three modes via `ArgsBuffer` config: +- **NO_BUF** — one item at a time (default) +- **BATCH** — accumulate N items, process as batch +- **WINDOW** — sliding window of size N + +### Implementations + +- **Parsers** (`src/detectmatelibrary/parsers/`): `JsonParser`, `DummyParser`, `TemplateMatcherParser` (uses Drain3 for template mining) +- **Detectors** (`src/detectmatelibrary/detectors/`): `NewValueDetector`, `NewValueComboDetector`, `RandomDetector`, `DummyDetector` +- **Utilities**
(`src/detectmatelibrary/utils/`): `DataBuffer`, `EventPersistency`, `KeyExtractor`, `TimeFormatHandler`, `IdGenerator` + +## Extending the Library + +Implement a custom detector by subclassing `CoreDetector`: + +```python +class MyDetectorConfig(CoreDetectorConfig): + method_type: str = "my_detector" + my_param: int = 10 + +class MyDetector(CoreDetector): + def __init__(self, name="MyDetector", config=MyDetectorConfig()): + super().__init__(name=name, config=config) + + def train(self, input_: ParserSchema) -> None: + pass # optional + + def detect(self, input_: ParserSchema, output_: DetectorSchema) -> bool: + output_["detectorID"] = self.name + output_["score"] = 0.0 + return False # True = anomaly detected +``` + +Same pattern applies for `CoreParser` — implement `parse(input_: LogSchema, output_: ParserSchema) -> bool`. + +## Code Quality + +Pre-commit hooks enforce: +- **mypy** strict mode +- **flake8** linting, **autopep8** formatting (max line 110) +- **bandit** security checks, **vulture** dead-code detection (70% threshold) +- **docformatter** docstring style + +Python 3.12 is required (see `.python-version`). diff --git a/README.md b/README.md index 233054b..75442af 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,13 @@ Main library to run the different components in DetectMate. The library contains the next components: -* **Readers**: insert logs into the system. * **Parsers**: parse the logs receive from the reader. * **Detectors**: return alerts if anomalies are detected. -* **Outputs**: return alerts as outputs. * **Schemas**: standard data classes use in DetectMate. 
``` -+---------+ +--------+ +-----------+ +--------+ -| Reader | --> | Parser | --> | Detector | --> | Output | -+---------+ +--------+ +-----------+ +--------+ + +--------+ +-----------+ + | Parser | --> | Detector | + +--------+ +-----------+ ``` ## Developer setup: @@ -22,7 +20,7 @@ The library contains the next components: Set up the dev environment and install pre-commit hooks: ```bash -uv pip install -e .[dev] +uv sync --dev uv run prek install ``` @@ -91,6 +89,7 @@ workspaces/custom_parser/ # workspace root │ └── custom_parser.py # CoreParser-based template ├── tests/ │ └── test_custom_parser.py # generated from template to test custom_parser +├── data.json # example data to run the code ├── LICENSE.md # copied from main project ├── .gitignore # copied from main project ├── .pre-commit-config.yaml # copied from main project diff --git a/config/NVD_audit_config.yaml b/config/NVD_audit_config.yaml new file mode 100644 index 0000000..1571cbb --- /dev/null +++ b/config/NVD_audit_config.yaml @@ -0,0 +1,40 @@ +detectors: + NewValueDetector: + auto_config: false + events: + 0: + NewValueDetector: + header_variables: + - pos: Type + params: {} + variables: + - name: var_2 + pos: 2 + - name: var_4 + pos: 4 + - name: var_5 + pos: 5 + - name: var_6 + pos: 6 + - name: var_7 + pos: 7 + - name: var_8 + pos: 8 + - name: var_9 + pos: 9 + 1: + NewValueDetector: + header_variables: + - pos: Type + params: {} + 2: + NewValueDetector: + params: {} + variables: + - name: var_3 + pos: 3 + method_type: new_value_detector + params: + data_use_training: null + parser: + start_id: 10 diff --git a/config/pipeline_config_default.yaml b/config/pipeline_config_default.yaml index 1ce2c62..0475495 100644 --- a/config/pipeline_config_default.yaml +++ b/config/pipeline_config_default.yaml @@ -1,15 +1,8 @@ -readers: - File_reader: - method_type: log_file_reader - auto_config: False - params: - file: local/miranda.json - parsers: MatcherParser: method_type: matcher_parser auto_config: False - 
log_format: "type= msg=audit(