From 844454854dc503904e09e6ce0b86852d06f1212c Mon Sep 17 00:00:00 2001 From: laurenzzumschlinge Date: Thu, 18 Jun 2026 20:42:33 +0200 Subject: [PATCH] ci: set up GitHub Actions workflow and requirements.txt for automated testing --- .github/workflows/ci.yml | 43 +++++++++++++++++++++++++++++++++++ requirements.txt | 5 ++++ src/pipelines/run_pipeline.py | 12 ++++++---- src/tests/test_pipeline.py | 16 ++++++------- 4 files changed, 63 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 requirements.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..bad52e4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,43 @@ +name: Philea CI Pipeline + +on: + push: + branches: [ main, dev ] + pull_request: + branches: [ main, dev ] + +jobs: + ci: + runs-on: ubuntu-latest + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install Dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run Linter (Flake8) + run: | + # Stop the build if there are Python syntax errors or undefined names + flake8 src/ --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 src/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Syntax Compile Check (Build) + run: | + python -m compileall src/ + + - name: Run Unit Tests + env: + GEMINI_API_KEY: "dummy_key" + run: | + PYTHONPATH=src python -m unittest discover -s src/tests diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..22c9327 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +requests>=2.31.0 +beautifulsoup4>=4.12.0 +google-genai>=2.0.0 +pydantic>=2.0.0 +flake8>=7.0.0 diff --git a/src/pipelines/run_pipeline.py b/src/pipelines/run_pipeline.py index 6ff6caa..688f11d 100644 --- a/src/pipelines/run_pipeline.py +++ b/src/pipelines/run_pipeline.py @@ -3,12 +3,14 @@ import argparse import logging -# Ensure project root is in sys.path so imports work regardless of working directory -PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) -if PROJECT_ROOT not in sys.path: - sys.path.insert(0, PROJECT_ROOT) +# Ensure src directory is in sys.path so imports work regardless of working directory +PIPELINES_DIR = os.path.dirname(os.path.abspath(__file__)) +SRC_DIR = os.path.dirname(PIPELINES_DIR) +PROJECT_ROOT = SRC_DIR +if SRC_DIR not in sys.path: + sys.path.insert(0, SRC_DIR) -from scraper.philea import scrape, save_data as save_raw_data +from scrapers.philea import scrape, save_data as save_raw_data from preprocessing.extract_geo_topic import extract_tags, extract_geo, save_data as save_preprocessed_data, load_data # Configure logger diff --git a/src/tests/test_pipeline.py b/src/tests/test_pipeline.py index 8b3906d..0ddef74 100644 --- a/src/tests/test_pipeline.py +++ b/src/tests/test_pipeline.py @@ -7,12 +7,12 @@ # Add project root to sys.path so we can import scraper and preprocessing sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from scraper.philea import make_request, scrape +from scrapers.philea import make_request, scrape from preprocessing.extract_geo_topic import extract_tags, extract_geo class TestPhileaScraper(unittest.TestCase): - @patch("scraper.philea.requests.request") + @patch("scrapers.philea.requests.request") def test_make_request_success(self, mock_request): # Setup mock response mock_resp = MagicMock() @@ -23,8 +23,8 @@ def test_make_request_success(self, mock_request): self.assertEqual(resp.status_code, 200) mock_request.assert_called_once_with("GET", "https://example.com", timeout=10) - @patch("scraper.philea.requests.request") - @patch("scraper.philea.time.sleep") # Mock sleep to speed up test run + @patch("scrapers.philea.requests.request") + @patch("scrapers.philea.time.sleep") # Mock sleep to speed up test run def test_make_request_retry_on_transient_error(self, mock_sleep, mock_request): # Setup mock responses: first two attempts return 500, third returns 200 mock_resp_fail = MagicMock() @@ -40,8 +40,8 @@ def test_make_request_retry_on_transient_error(self, mock_sleep, mock_request): self.assertEqual(mock_request.call_count, 3) self.assertEqual(mock_sleep.call_count, 2) - @patch("scraper.philea.requests.request") - @patch("scraper.philea.time.sleep") + @patch("scrapers.philea.requests.request") + @patch("scrapers.philea.time.sleep") def test_make_request_persistent_failure(self, mock_sleep, mock_request): # All requests fail with ConnectionError mock_request.side_effect = requests.exceptions.ConnectionError("Connection failed") @@ -50,8 +50,8 @@ def test_make_request_persistent_failure(self, mock_sleep, mock_request): make_request("GET", "https://example.com", max_retries=3, backoff_factor=0.1) self.assertEqual(mock_request.call_count, 3) - @patch("scraper.philea.make_request") - @patch("scraper.philea.BeautifulSoup") + @patch("scrapers.philea.make_request") + @patch("scrapers.philea.BeautifulSoup") def test_scrape_limit(self, mock_bs, mock_make_request): # Mock members list call mock_list_resp = MagicMock()