# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
    branches: [ "development" ]
  pull_request:
    branches: [ "development" ]
  schedule:
    - cron: '23 23 * * 4'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
          - language: actions
            build-mode: none
          - language: python
            build-mode: none
        # CodeQL supports the following keyword values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Add any setup steps before running the `github/codeql-action/init` action.
      # This includes steps like installing compilers or runtimes (`actions/setup-node`
      # or others). This is typically only required for manual builds.
      # - name: Setup runtime (example)
      #   uses: actions/setup-example@v1

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v4
        with:
          languages: ${{ matrix.language }}
          build-mode: ${{ matrix.build-mode }}
          # If you wish to specify custom queries, you can do so here or in a config file.
          # By default, queries listed here will override any specified in a config file.
          # Prefix the list here with "+" to use these queries and those in the config file.

          # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
          # queries: security-extended,security-and-quality

      # If the analyze step fails for one of the languages you are analyzing with
      # "We were unable to automatically build your code", modify the matrix above
      # to set the build mode to "manual" for that language. Then modify this step
      # to build your code.
      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
      - name: Run manual build steps
        if: matrix.build-mode == 'manual'
        shell: bash
        run: |
          echo 'If you are using a "manual" build mode for one or more of the' \
            'languages you are analyzing, replace this with the commands to build' \
            'your code, for example:'
          echo '  make bootstrap'
          echo '  make release'
          exit 1

      - name: Perform CodeQL Analysis
        # The id lets the gate step below read where the SARIF files were written.
        id: analyze
        uses: github/codeql-action/analyze@v4
        with:
          category: "/language:${{matrix.language}}"

      - name: Install jq
        if: runner.os == 'Linux'
        shell: bash
        run: |
          # Hosted runners normally ship jq; only install it when it is missing.
          if ! command -v jq >/dev/null 2>&1; then
            sudo apt-get update && sudo apt-get install -y jq
          fi

      # Gate the job on the analysis result: any reported finding fails the run.
      - name: Fail if CodeQL found issues
        shell: bash
        env:
          # Directory reported by the analyze step, so no repository-specific
          # absolute path has to be hard-coded here.
          SARIF_DIR: ${{ steps.analyze.outputs.sarif-output }}
        run: |
          set -eo pipefail

          # Fall back to the analyze action's default output directory
          # (../results relative to the workspace) if the output is empty.
          SARIF_DIR="${SARIF_DIR:-${GITHUB_WORKSPACE}/../results}"

          shopt -s nullglob
          sarif_files=("${SARIF_DIR}"/*.sarif)
          if [ "${#sarif_files[@]}" -eq 0 ]; then
            echo "SARIF file not found in ${SARIF_DIR}. Analysis may have failed."
            exit 1
          fi

          # Sum the findings over every run of every file so that $count is
          # always a single integer, even when a SARIF file holds several runs.
          count=$(jq -s '[.[].runs[].results | length] | add // 0' "${sarif_files[@]}")
          if [ "$count" -gt 0 ]; then
            echo "CodeQL found $count issue(s)!"
            jq -r '.runs[].results[] | "Rule: \(.ruleId)\nSeverity: \(.level)\nMessage: \(.message.text)\nFile: \(.locations[0].physicalLocation.artifactLocation.uri)\nLine: \(.locations[0].physicalLocation.region.startLine)\n---"' "${sarif_files[@]}"
            exit 1
          else
            echo "No CodeQL issues found"
          fi
diff --git a/docs/schemas.md b/docs/schemas.md index 7528133..4d418ca 100644 --- a/docs/schemas.md +++ b/docs/schemas.md @@ -122,7 +122,7 @@ from detectmatelibrary import schemas kwargs = load_somewhere() # load the dict kwargs["log"] = "Test log" -log_schema = LogSchema(kwargs) +log_schema = schemas.LogSchema(kwargs) print(log_schema.log == "Test log") # True ``` @@ -132,11 +132,11 @@ print(log_schema.log == "Test log") # True ```python from detectmatelibrary import schemas -log_schema = LogSchema() +log_schema = schemas.LogSchema() log_schema.log = "Test log" print(log_schema["log"] == log_schema.log) # True -log_schema2 = LogSchema() +log_schema2 = schemas.LogSchema() print(log_schema == log_schema2) # False log_schema2.log = "Test log" @@ -148,12 +148,12 @@ print(log_schema == log_schema2) # True ```python from detectmatelibrary import schemas -log_schema = LogSchema() +log_schema = schemas.LogSchema() log_schema.log = "Test log" serialized = log_schema.serialize() print(isinstance(serialized, bytes)) # True -new_log_schema = LogSchema() +new_log_schema = schemas.LogSchema() new_log_schema.deserialize(serialized) print(new_log_schema.schema_id == log_schema.schema_id) # True ``` diff --git a/pyproject.toml b/pyproject.toml index 0610d34..edd0e0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,14 +3,16 @@ name = "detectmatelibrary" version = "0.1.0" description = "DetectMate Library for log processing components" readme = "README.md" +dynamic = ["authors"] requires-python = ">=3.12" dependencies = [ - "drain3>=0.9.11", "protobuf>=6.32.1", "pydantic>=2.11.7", "pyyaml>=6.0.3", "regex>=2025.11.3", "numpy>=2.3.2", + "pandas>=2.3.2", + "polars>=1.38.1", ] [dependency-groups] @@ -21,8 +23,6 @@ dev = [ "prek>=0.2.8", "pytest>=8.4.2", "pytest-cov>=6.2.1", - "pandas>=2.3.2", - "polars>=1.38.1", ] [build-system] diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4e86b68 --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +from setuptools import setup, 
find_packages +import tomllib + + +def gather_dependencies(toml_path: str = "pyproject.toml") -> list[str]: + with open(toml_path, "rb") as f: + data = tomllib.load(f) + + # Try Poetry first + poetry_deps = data.get("tool", {}).get("poetry", {}).get("dependencies", {}) + if poetry_deps: + return [f"{dep}{version}" for dep, version in poetry_deps.items()] + + # Fall back to PEP 621 + project_deps: list[str] = data.get("project", {}).get("dependencies", []) + return project_deps + + +setup( + name="detectmatelibrary", + version="0.1.0", + package_dir={"": "src"}, + packages=find_packages(where="src"), + description="DetectMate Library for log processing components", + author="voice", + author_email="voice@example.com", + install_requires=gather_dependencies(), +) diff --git a/src/detectmatelibrary/common/_config/__init__.py b/src/detectmatelibrary/common/_config/__init__.py index 297201c..7c0265b 100644 --- a/src/detectmatelibrary/common/_config/__init__.py +++ b/src/detectmatelibrary/common/_config/__init__.py @@ -1,5 +1,5 @@ -from detectmatelibrary.common._config._compile import ConfigMethods, generate_detector_config -from detectmatelibrary.common._config._formats import EventsConfig +from ._compile import ConfigMethods, generate_detector_config +from ._formats import EventsConfig __all__ = ["ConfigMethods", "generate_detector_config", "EventsConfig", "BasicConfig"] @@ -59,6 +59,7 @@ def to_dict(self, method_id: str) -> Dict[str, Any]: # Collect all non-meta fields for params params = {} events_data = None + instances_data = None for field_name, field_value in self: # Skip meta fields @@ -72,6 +73,13 @@ def to_dict(self, method_id: str) -> Dict[str, Any]: events_data = field_value.to_dict() else: events_data = field_value + # Handle global instances specially (top-level, not in params) + # Serialized as "global" in YAML (Python field is "global_instances") + elif field_name == "global_instances" and field_value: + instances_data = { + name: inst.to_dict() + for 
name, inst in field_value.items() + } else: # All other fields go into params params[field_name] = field_value @@ -80,6 +88,10 @@ def to_dict(self, method_id: str) -> Dict[str, Any]: if params: result["params"] = params + # Add global instances if they exist (serialized as "global" in YAML) + if instances_data is not None: + result["global"] = instances_data + # Add events if they exist if events_data is not None: result["events"] = events_data diff --git a/src/detectmatelibrary/common/_config/_compile.py b/src/detectmatelibrary/common/_config/_compile.py index d61ce8d..5198629 100644 --- a/src/detectmatelibrary/common/_config/_compile.py +++ b/src/detectmatelibrary/common/_config/_compile.py @@ -1,4 +1,4 @@ -from detectmatelibrary.common._config._formats import EventsConfig +from detectmatelibrary.common._config._formats import EventsConfig, _EventInstance from typing import Any, Dict, List, Sequence, Tuple, Union import warnings @@ -93,8 +93,9 @@ def check_type(config: Dict[str, Any], method_type: str) -> None: def process(config: Dict[str, Any]) -> Dict[str, Any]: has_params = "params" in config has_events = "events" in config + has_instances = "global" in config - if not has_params and not has_events and not config.get("auto_config", False): + if not has_params and not has_events and not has_instances and not config.get("auto_config", False): warnings.warn(MissingParamsWarning()) if has_params: @@ -108,11 +109,19 @@ def process(config: Dict[str, Any]) -> Dict[str, Any]: if has_events: config["events"] = EventsConfig._init(config["events"]) + # Handle "global" key: event-ID-independent global instances + # Renamed to "global_instances" to avoid collision with Python keyword + if has_instances: + config["global_instances"] = { + name: _EventInstance._init(**data) + for name, data in config.pop("global").items() + } + return config def generate_detector_config( - variable_selection: Dict[int, List[Union[str, Tuple[str, ...]]]], + variable_selection: Dict[int | 
class ConfigState(Enum):
    """Override switch for the configuration phase of a component."""

    DEFAULT = 0
    STOP_CONFIGURE = 1
    KEEP_CONFIGURE = 2

    def describe(self) -> str:
        """Return a one-sentence, human-readable explanation of this state."""
        if self is ConfigState.STOP_CONFIGURE:
            return "Force stop configuration."
        if self is ConfigState.KEEP_CONFIGURE:
            return "Keep configuring regardless of default behavior."
        return "Follow default configuration behavior."
    def configure(
        self, input_: List[BaseSchema] | BaseSchema,
    ) -> None:
        """Hook that receives buffered input during the configuration phase.

        ``process`` calls this instead of training/processing for as long as
        ``do_configure`` returns True, and returns ``None`` for that input.
        The base implementation does nothing.

        Args:
            input_: The buffered data handed over by ``process``.
        """
        pass

    def set_configuration(self) -> None:
        """Hook that finalizes whatever ``configure`` collected.

        ``process`` calls this once: on the first input for which
        ``do_configure`` returns False after at least one input was used for
        configuration. The base implementation does nothing.
        """
        pass
def get_global_variables(
    input_: ParserSchema,
    global_instances: Dict[str, _EventInstance],
) -> Dict[str, Any]:
    """Collect the header variables requested by the global instances.

    Every name listed in an instance's ``header_variables`` is looked up in
    ``input_["logFormatVariables"]``; names missing from the input are skipped.

    Args:
        input_: Parser schema containing logFormatVariables
        global_instances: Dict of instance_name -> _EventInstance configs

    Returns:
        Dict mapping variable names to their values from the input
    """
    return {
        name: input_["logFormatVariables"][name]
        for instance in global_instances.values()
        for name in instance.header_variables
        if name in input_["logFormatVariables"]
    }
detectmatelibrary.utils.persistency.event_data_structures.trackers import ( @@ -10,6 +15,7 @@ from detectmatelibrary.utils.persistency.event_persistency import EventPersistency from detectmatelibrary.schemas import ParserSchema, DetectorSchema +from detectmatelibrary.constants import GLOBAL_EVENT_ID from typing import Any, Dict, Sequence, cast, Tuple from itertools import combinations @@ -48,6 +54,7 @@ class NewValueComboDetectorConfig(CoreDetectorConfig): method_type: str = "new_value_combo_detector" events: EventsConfig | dict[str, Any] = {} + global_instances: Dict[str, _EventInstance] = {} comb_size: int = 2 @@ -85,6 +92,14 @@ def train(self, input_: ParserSchema) -> None: # type: ignore event_template=input_["template"], named_variables=configured_variables ) + if config.global_instances: + global_vars = get_global_variables(input_, config.global_instances) + if global_vars: + self.persistency.ingest_event( + event_id=GLOBAL_EVENT_ID, + event_template=input_["template"], + named_variables=global_vars + ) def detect( self, input_: ParserSchema, output_: DetectorSchema # type: ignore @@ -110,6 +125,18 @@ def detect( ) overall_score += 1.0 + if config.global_instances and GLOBAL_EVENT_ID in known_events: + global_vars = get_global_variables(input_, config.global_instances) + global_combo_dict = get_combo(global_vars) + global_tracker = known_events[GLOBAL_EVENT_ID] + for combo_key, multi_tracker in global_tracker.get_data().items(): + value_tuple = global_combo_dict.get(combo_key) + if value_tuple is None: + continue + if value_tuple not in multi_tracker.unique_set: + alerts[f"Global - {combo_key}"] = f"Unknown value combination: {value_tuple}" + overall_score += 1.0 + if overall_score > 0: output_["score"] = overall_score output_["description"] = ( @@ -120,7 +147,7 @@ def detect( return True return False - def configure(self, input_: ParserSchema) -> None: + def configure(self, input_: ParserSchema) -> None: # type: ignore """Configure the detector based on the 
stability of individual variables, then learn value combinations based on that configuration.""" diff --git a/src/detectmatelibrary/detectors/new_value_detector.py b/src/detectmatelibrary/detectors/new_value_detector.py index dad5ecd..eab00c6 100644 --- a/src/detectmatelibrary/detectors/new_value_detector.py +++ b/src/detectmatelibrary/detectors/new_value_detector.py @@ -1,8 +1,12 @@ from detectmatelibrary.common._config._compile import generate_detector_config -from detectmatelibrary.common._config._formats import EventsConfig - -from detectmatelibrary.common.detector import CoreDetectorConfig, CoreDetector, get_configured_variables +from detectmatelibrary.common._config._formats import EventsConfig, _EventInstance +from detectmatelibrary.common.detector import ( + CoreDetectorConfig, + CoreDetector, + get_configured_variables, + get_global_variables +) from detectmatelibrary.utils.persistency.event_data_structures.trackers.stability.stability_tracker import ( EventStabilityTracker ) @@ -10,14 +14,16 @@ from detectmatelibrary.utils.data_buffer import BufferMode from detectmatelibrary.schemas import ParserSchema, DetectorSchema +from detectmatelibrary.constants import GLOBAL_EVENT_ID -from typing import Any +from typing import Any, Dict class NewValueDetectorConfig(CoreDetectorConfig): method_type: str = "new_value_detector" events: EventsConfig | dict[str, Any] = {} + global_instances: Dict[str, _EventInstance] = {} class NewValueDetector(CoreDetector): @@ -50,6 +56,14 @@ def train(self, input_: ParserSchema) -> None: # type: ignore event_template=input_["template"], named_variables=configured_variables ) + if self.config.global_instances: + global_vars = get_global_variables(input_, self.config.global_instances) + if global_vars: + self.persistency.ingest_event( + event_id=GLOBAL_EVENT_ID, + event_template=input_["template"], + named_variables=global_vars + ) def detect( self, input_: ParserSchema, output_: DetectorSchema # type: ignore @@ -74,6 +88,17 @@ def 
detect( ) overall_score += 1.0 + if self.config.global_instances and GLOBAL_EVENT_ID in known_events: + global_vars = get_global_variables(input_, self.config.global_instances) + global_tracker = known_events[GLOBAL_EVENT_ID] + for var_name, multi_tracker in global_tracker.get_data().items(): + value = global_vars.get(var_name) + if value is None: + continue + if value not in multi_tracker.unique_set: + alerts[f"Global - {var_name}"] = f"Unknown value: '{value}'" + overall_score += 1.0 + if overall_score > 0: output_["score"] = overall_score output_["description"] = f"{self.name} detects values not encountered in training as anomalies." @@ -82,7 +107,7 @@ def detect( return False - def configure(self, input_: ParserSchema) -> None: + def configure(self, input_: ParserSchema) -> None: # type: ignore self.auto_conf_persistency.ingest_event( event_id=input_["EventID"], event_template=input_["template"], diff --git a/src/detectmatelibrary/schemas/__init__.py b/src/detectmatelibrary/schemas/__init__.py index 9e91517..a960404 100644 --- a/src/detectmatelibrary/schemas/__init__.py +++ b/src/detectmatelibrary/schemas/__init__.py @@ -3,7 +3,7 @@ -from detectmatelibrary.schemas._classes import ( +from ._classes import ( BaseSchema, LogSchema, ParserSchema, @@ -13,4 +13,11 @@ ) -__all__ = ["BaseSchema", "LogSchema", "ParserSchema", "DetectorSchema", "OutputSchema", "FieldNotFound"] +__all__ = [ + "BaseSchema", + "LogSchema", + "ParserSchema", + "DetectorSchema", + "OutputSchema", + "FieldNotFound" +] diff --git a/src/detectmatelibrary/utils/persistency/event_persistency.py b/src/detectmatelibrary/utils/persistency/event_persistency.py index 42719d6..c21cb76 100644 --- a/src/detectmatelibrary/utils/persistency/event_persistency.py +++ b/src/detectmatelibrary/utils/persistency/event_persistency.py @@ -25,15 +25,15 @@ def __init__( *, event_data_kwargs: Optional[dict[str, Any]] = None, ): - self.events_data: Dict[int, EventDataStructure] = {} + self.events_data: Dict[int | str, 
EventDataStructure] = {} self.event_data_class = event_data_class self.event_data_kwargs = event_data_kwargs or {} self.variable_blacklist = variable_blacklist or [] - self.event_templates: Dict[int, str] = {} + self.event_templates: Dict[int | str, str] = {} def ingest_event( self, - event_id: int, + event_id: int | str, event_template: str, variables: list[Any] = [], named_variables: Dict[str, Any] = {} @@ -52,12 +52,12 @@ def ingest_event( data = data_structure.to_data(all_variables) data_structure.add_data(data) - def get_event_data(self, event_id: int) -> Any | None: + def get_event_data(self, event_id: int | str) -> Any | None: """Retrieve the data for a specific event ID.""" data_structure = self.events_data.get(event_id) return data_structure.get_data() if data_structure is not None else None - def get_events_data(self) -> Dict[int, EventDataStructure]: + def get_events_data(self) -> Dict[int | str, EventDataStructure]: """Retrieve the events data that is currently stored. Returns: @@ -77,11 +77,11 @@ def get_events_data(self) -> Dict[int, EventDataStructure]: """ return self.events_data - def get_event_template(self, event_id: int) -> str | None: + def get_event_template(self, event_id: int | str) -> str | None: """Retrieve the template for a specific event ID.""" return self.event_templates.get(event_id) - def get_event_templates(self) -> Dict[int, str]: + def get_event_templates(self) -> Dict[int | str, str]: """Retrieve all event templates.""" return self.event_templates @@ -107,7 +107,7 @@ def get_all_variables( }) return all_vars - def __getitem__(self, event_id: int) -> EventDataStructure | None: + def __getitem__(self, event_id: int | str) -> EventDataStructure | None: return self.events_data.get(event_id) def __repr__(self) -> str: diff --git a/src/detectmatelibrary/utils/time_format_handler.py b/src/detectmatelibrary/utils/time_format_handler.py index 5f2fcd0..66ed748 100644 --- a/src/detectmatelibrary/utils/time_format_handler.py +++ 
b/src/detectmatelibrary/utils/time_format_handler.py @@ -21,8 +21,10 @@ class TimeFormatHandler: "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%d/%b/%Y:%H:%M:%S %z", # Apache style: 10/Oct/2000:13:55:36 -0700 + "%d/%b/%Y:%H:%M:%S", # Apache style without timezone "%b %d %H:%M:%S", # syslog without year "%H:%M:%S", + "%A, %B %d, %Y %H:%M:%S", # "Wednesday, March 4, 2026 14:18:00" ] def __init__(self) -> None: diff --git a/tests/test_common/test_config_roundtrip.py b/tests/test_common/test_config_roundtrip.py index 85517e1..764b668 100644 --- a/tests/test_common/test_config_roundtrip.py +++ b/tests/test_common/test_config_roundtrip.py @@ -1,8 +1,9 @@ """Test that YAML -> Pydantic -> YAML is preserved (round-trip test).""" from detectmatelibrary.common._config import BasicConfig -from detectmatelibrary.common._config._formats import EventsConfig +from detectmatelibrary.common._config._formats import EventsConfig, _EventInstance +from typing import Dict import yaml @@ -20,6 +21,7 @@ class MockupDetectorConfig(BasicConfig): auto_config: bool = False parser: str = "" events: EventsConfig | None = None + global_instances: Dict[str, _EventInstance] = {} def load_test_config() -> dict: @@ -233,6 +235,40 @@ def test_true_roundtrip_preservation(self): # The two dicts should be identical assert dict1 == dict2 + def test_global_instance_roundtrip(self): + """Test that a detector config with a global instance round-trips + correctly.""" + config_yaml = load_test_config() + method_id = "detector_global_instance" + + # Load from YAML + config = MockupDetectorConfig.from_dict(config_yaml, method_id) + + # global_instances must be populated + assert "global_monitor" in config.global_instances + instance = config.global_instances["global_monitor"] + assert "Level" in instance.header_variables + assert "Time" in instance.header_variables + + # Convert back to dict + result_dict = config.to_dict(method_id) + result = result_dict["detectors"][method_id] + + # Serialised as "global" key + 
assert "global" in result + assert "global_monitor" in result["global"] + assert "header_variables" in result["global"]["global_monitor"] + hv_positions = [hv["pos"] for hv in result["global"]["global_monitor"]["header_variables"]] + assert "Level" in hv_positions + assert "Time" in hv_positions + + # True round-trip: yaml -> pydantic -> yaml -> pydantic + config2 = MockupDetectorConfig.from_dict(result_dict, method_id) + dict2 = config2.to_dict(method_id) + + assert config.global_instances.keys() == config2.global_instances.keys() + assert result_dict == dict2 + def test_parser_true_roundtrip(self): """Test parser yaml -> pydantic -> yaml -> pydantic roundtrip.""" config_yaml = load_test_config() diff --git a/tests/test_common/test_core.py b/tests/test_common/test_core.py index 018412b..a7b5220 100644 --- a/tests/test_common/test_core.py +++ b/tests/test_common/test_core.py @@ -1,4 +1,4 @@ -from detectmatelibrary.common.core import CoreConfig, CoreComponent, TrainState +from detectmatelibrary.common.core import CoreConfig, CoreComponent, TrainState, ConfigState from detectmatelibrary.common._config import BasicConfig from detectmatelibrary.utils.data_buffer import ArgsBuffer @@ -27,7 +27,8 @@ class MockConfigWithTraining(CoreConfig): "comp_type": "default_type", "auto_config": False, "start_id": 10, - "data_use_training": None + "data_use_training": None, + "data_use_configure": None } @@ -52,6 +53,54 @@ def run(self, input_, output_) -> None: return False +class MockConfigWithConfigure(CoreConfig): + thresholds: float = 0.7 + max_iter: int = 50 + data_use_configure: int | None = 3 + + +class MockComponentWithConfigure(CoreComponent): + def __init__( + self, name: str, config: MockConfigWithConfigure = MockConfigWithConfigure() + ) -> None: + super().__init__( + name=name, type_="Dummy", config=config, input_schema=schemas.LogSchema + ) + self.configure_data: list = [] + self.set_configuration_called: int = 0 + + def configure(self, input_) -> None: + 
class MockComponentWithConfigureAndTraining(CoreComponent):
    """Test double that records every configure, train and finalize call."""

    def __init__(self, name: str, config: CoreConfig = CoreConfig()) -> None:
        super().__init__(
            name=name,
            type_="Dummy",
            config=config,
            input_schema=schemas.LogSchema,
        )
        # Call logs inspected by the tests.
        self.set_configuration_called: int = 0
        self.train_data: list = []
        self.configure_data: list = []

    def train(self, input_) -> None:
        self.train_data += [input_]

    def configure(self, input_) -> None:
        self.configure_data += [input_]

    def set_configuration(self) -> None:
        self.set_configuration_called = self.set_configuration_called + 1

    def run(self, input_, output_) -> bool:
        return False
ConfigState.STOP_CONFIGURE + + for i in range(10): + component.process(self._make_log(i)) + + assert len(component.configure_data) == 0 + assert component.set_configuration_called == 0 + + def test_configuration_keep_configure(self) -> None: + component = MockComponentWithConfigure(name="DummyCfg4") + component.configure_state = ConfigState.KEEP_CONFIGURE + + for i in range(10): + component.process(self._make_log(i)) + + assert len(component.configure_data) == 10 + assert component.set_configuration_called == 0 + + def test_configuration_before_training(self) -> None: + config = CoreConfig(data_use_configure=2, data_use_training=3) + component = MockComponentWithConfigureAndTraining(name="DummyCfg5", config=config) + + for i in range(10): + component.process(self._make_log(i)) + + assert len(component.configure_data) == 2 + assert len(component.train_data) == 3 + assert component.set_configuration_called == 1 + + def test_set_configuration_called_once(self) -> None: + component = MockComponentWithConfigure(name="DummyCfg6") + + for i in range(component.config.data_use_configure + 5): # type: ignore[operator] + component.process(self._make_log(i)) + + assert component.set_configuration_called == 1 diff --git a/tests/test_common/test_core_detector.py b/tests/test_common/test_core_detector.py index d2226ed..226b45f 100644 --- a/tests/test_common/test_core_detector.py +++ b/tests/test_common/test_core_detector.py @@ -79,7 +79,7 @@ def detect(self, input_, output_): "parsedLogID": "22", "parserID": "test", "log": "This is a parsed log.", - "logFormatVariables": {"Time": "12121.12:20"}, + "logFormatVariables": {"Time": "12121.12"}, } diff --git a/tests/test_common/test_extract_timestamp.py b/tests/test_common/test_extract_timestamp.py new file mode 100644 index 0000000..77196e5 --- /dev/null +++ b/tests/test_common/test_extract_timestamp.py @@ -0,0 +1,52 @@ +from detectmatelibrary.common.detector import _extract_timestamp +import detectmatelibrary.schemas as schemas + + 
+class TestCoreDetector: + def test_various_time_formats(self) -> None: + """Test that _extract_timestamp handles a wide range of realistic time + formats.""" + dummy_schema = { + "parserType": "a", + "EventID": 0, + "template": "asd", + "variables": [""], + "logID": "0", + "parsedLogID": "22", + "parserID": "test", + "log": "This is a parsed log.", + "logFormatVariables": {"Time": "12121"}, + } + # Epoch seconds for 2026-03-04T14:18:00 UTC; timezone-naive inputs are expected to be read as UTC + EXPECTED_UTC = 1772633880 + test_cases = [ + ("0", 0), + ("1772812294", 1772812294), + ("1772812294.5", 1772812294), + # Apache/nginx format + ("04/Mar/2026:14:18:00 +0000", EXPECTED_UTC), + ("04/Mar/2026:14:18:00", EXPECTED_UTC), + # ISO 8601 formats + ("2026-03-04T14:18:00+00:00", EXPECTED_UTC), + ("2026-03-04T14:18:00Z", EXPECTED_UTC), + ("2026-03-04T14:18:00.000Z", EXPECTED_UTC), + ("2026-03-04T14:18:00", EXPECTED_UTC), + # Space-separated + ("2026-03-04 14:18:00", EXPECTED_UTC), + ("2026-03-04 14:18:00.000", EXPECTED_UTC), + ("2026/03/04 14:18:00", EXPECTED_UTC), + # Timezone variations + ("2026-03-04T15:18:00+01:00", EXPECTED_UTC), + ("2026-03-04T13:18:00-01:00", EXPECTED_UTC), + # High precision and different separators + ("2026-03-04T14:18:00.123Z", EXPECTED_UTC), + ("2026-03-04 14:18:00,000", EXPECTED_UTC), + # Common human-readable variations + ("Wednesday, March 4, 2026 14:18:00", EXPECTED_UTC), + ] + for time_str, expected in test_cases: + schema = schemas.ParserSchema({**dummy_schema, "logFormatVariables": {"Time": time_str}}) + result = _extract_timestamp(schema) + assert result == [expected], ( + f"Format '{time_str}': expected [{expected}], got {result}" + ) diff --git a/tests/test_detectors/test_new_value_detector.py b/tests/test_detectors/test_new_value_detector.py index cf2ed70..f3bdef9 100644 --- a/tests/test_detectors/test_new_value_detector.py +++ b/tests/test_detectors/test_new_value_detector.py @@ -8,7 +8,11 @@ - Input/output schema validation """ -from 
detectmatelibrary.detectors.new_value_detector import NewValueDetector, BufferMode +from detectmatelibrary.detectors.new_value_detector import ( + NewValueDetector, NewValueDetectorConfig, BufferMode +) +from detectmatelibrary.common.core import ConfigState, TrainState +from detectmatelibrary.constants import GLOBAL_EVENT_ID from detectmatelibrary.parsers.template_matcher import MatcherParser from detectmatelibrary.helper.from_to import From import detectmatelibrary.schemas as schemas @@ -233,3 +237,77 @@ def test_audit_log_anomalies(self): detected_ids.add(log["logID"]) assert detected_ids == {'1859', '1860', '1861', '1862', '1864', '1865', '1866', '1867'} + + +class TestNewValueDetectorAutoConfig: + """Test that process() drives configure/set_configuration/train/detect + automatically.""" + + def test_audit_log_anomalies_via_process(self): + parser = MatcherParser(config=_PARSER_CONFIG) + detector = NewValueDetector() + + logs = list(From.log(parser, in_path="tests/test_folder/audit.log", do_process=True)) + + # Phase 1: configure — keep configuring for logs[:1800] + detector.configure_state = ConfigState.KEEP_CONFIGURE + for log in logs[:1800]: + detector.process(log) + + # Transition: stop configure so next process() call triggers set_configuration() + detector.configure_state = ConfigState.STOP_CONFIGURE + + # Phase 2: train — keep training for logs[:1800] + detector.train_state = TrainState.KEEP_TRAINING + for log in logs[:1800]: + detector.process(log) + + # Phase 3: detect — stop training so process() only calls detect() + detector.train_state = TrainState.STOP_TRAINING + detected_ids: set[str] = set() + for log in logs[1800:]: + if detector.process(log) is not None: + detected_ids.add(log["logID"]) + + assert detected_ids == {'1859', '1860', '1861', '1862', '1864', '1865', '1866', '1867'} + + +class TestNewValueDetectorGlobalInstances: + """Tests event-ID-independent global instance detection.""" + + def test_global_instance_detects_new_type(self): + 
"""Global instance monitoring Type detects CRED_REFR, USER_AUTH, + USER_CMD which only appear after the training window (line 1800+).""" + parser = MatcherParser(config=_PARSER_CONFIG) + config_dict = { + "detectors": { + "NewValueDetector": { + "method_type": "new_value_detector", + "auto_config": False, + "global": { + "test": { + "header_variables": [{"pos": "Type"}] + } + } + } + } + } + config = NewValueDetectorConfig.from_dict(config_dict, "NewValueDetector") + detector = NewValueDetector(config=config) + + logs = list(From.log(parser, in_path="tests/test_folder/audit.log", do_process=True)) + + for log in logs[:1800]: + detector.train(log) + + # Global tracker must be populated under the sentinel event ID + assert GLOBAL_EVENT_ID in detector.persistency.get_events_data() + + detected_ids: set[str] = set() + for log in logs[1800:]: + output = schemas.DetectorSchema() + if detector.detect(log, output_=output): + assert all(key.startswith("Global -") for key in output["alertsObtain"]) + detected_ids.add(log["logID"]) + + assert len(detected_ids) > 0 diff --git a/tests/test_folder/test_config.yaml b/tests/test_folder/test_config.yaml index 8dd3b6c..ff679b1 100644 --- a/tests/test_folder/test_config.yaml +++ b/tests/test_folder/test_config.yaml @@ -133,6 +133,19 @@ detectors: parser: example_parser_1 auto_config: true + detector_global_instance: + method_type: ExampleDetector + parser: example_parser_1 + auto_config: false + params: {} + global: + global_monitor: + header_variables: + - pos: Level + params: + threshold: 0.2 + - pos: Time + NewValueDetector: method_type: new_value_detector parser: example_parser_1 diff --git a/tests/test_pipelines/test_configuration_engine.py b/tests/test_pipelines/test_configuration_engine.py new file mode 100644 index 0000000..493829a --- /dev/null +++ b/tests/test_pipelines/test_configuration_engine.py @@ -0,0 +1,89 @@ +from detectmatelibrary.detectors.new_value_detector import NewValueDetector, NewValueDetectorConfig +from 
detectmatelibrary.parsers.template_matcher import MatcherParser +from detectmatelibrary.helper.from_to import From + +import json + +AUDIT_LOG = "tests/test_folder/audit.log" +AUDIT_TEMPLATES = "tests/test_folder/audit_templates.txt" +ANOMALY_LABELS = "tests/test_folder/audit_anomaly_labels.log" +LOG_FORMAT = "type= msg=audit(