From f621ffcf2308e95ff0c68ef8c1f6f95e34c504cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=A3=20Bida=20Vacaro?= Date: Wed, 13 May 2026 18:38:20 -0300 Subject: [PATCH] docs: include more documentation and docstrings --- .github/workflows/release.yaml | 56 ++ README.md | 10 + docs/requirements.txt | 1 + docs/source/api.rst | 137 ++++ docs/source/conf.py | 16 +- docs/source/databases/CNES.ipynb | 895 -------------------- docs/source/databases/SIA.ipynb | 694 ---------------- docs/source/databases/SIH.ipynb | 685 ---------------- docs/source/databases/SIM.ipynb | 705 ---------------- docs/source/databases/SINAN.ipynb | 1222 ---------------------------- docs/source/databases/SINASC.ipynb | 693 ---------------- docs/source/index.rst | 2 +- pysus/api/__init__.py | 6 + pysus/api/_impl/__init__.py | 6 + pysus/api/_impl/databases.py | 74 +- pysus/api/client.py | 48 +- pysus/api/dadosgov/__init__.py | 2 + pysus/api/dadosgov/client.py | 22 + pysus/api/dadosgov/databases.py | 32 + pysus/api/dadosgov/models.py | 26 + pysus/api/ducklake/__init__.py | 6 + pysus/api/ducklake/catalog.py | 20 + pysus/api/ducklake/client.py | 36 + pysus/api/ducklake/models.py | 28 + pysus/api/extensions.py | 103 ++- pysus/api/ftp/__init__.py | 2 + pysus/api/ftp/client.py | 23 + pysus/api/ftp/databases.py | 56 ++ pysus/api/ftp/models.py | 40 +- pysus/api/models.py | 145 +++- pysus/api/types.py | 10 + 31 files changed, 868 insertions(+), 4933 deletions(-) create mode 100644 docs/source/api.rst delete mode 100644 docs/source/databases/CNES.ipynb delete mode 100644 docs/source/databases/SIA.ipynb delete mode 100644 docs/source/databases/SIH.ipynb delete mode 100644 docs/source/databases/SIM.ipynb delete mode 100644 docs/source/databases/SINAN.ipynb delete mode 100644 docs/source/databases/SINASC.ipynb diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 930a3d62..a48c0331 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -7,6 +7,10 @@ on: pull_request: branches: [ main ] +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + jobs: build: runs-on: ubuntu-latest @@ -39,3 +43,55 @@ jobs: run: | poetry config pypi-token.pypi ${PYPI_TOKEN} make release + + docs: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + permissions: + contents: read + pages: write + id-token: write + + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - uses: actions/checkout@v4 + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + environment-file: conda/dev.yaml + channels: conda-forge,nodefaults + activate-environment: pysus + auto-update-conda: true + conda-solver: libmamba + + - name: Install dependencies + run: | + pip install poetry poetry-plugin-export + poetry config virtualenvs.create false + poetry install --with docs --extras dbc + + - name: Build docs + run: | + cd docs + make html + + - name: Configure GitHub Pages + uses: actions/configure-pages@v5 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/build/html + + - name: Deploy to GitHub Pages + if: github.ref == 'refs/heads/main' + id: deployment + uses: actions/deploy-pages@v4 diff --git a/README.md b/README.md index 9391b203..dd403dfc 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,16 @@ df = sih(state="SP", year=2024, month=[1, 2, 3]) df = cnes(state="SP", year=2024, month=1) ``` +### Listing the files + +You can also list the files within the dataset to check which files are available to download + +```python +from pysus import list_files + +list_files("SINAN") +``` + ### Using the PySUS Client ```python diff --git a/docs/requirements.txt b/docs/requirements.txt index 1dcfb82a..f2df05cd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ nbsphinx sphinx sphinx-rtd-theme +standard-imghdr diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 00000000..b4a507e4 --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,137 @@ +API Reference +============= + +The ``pysus.api`` package provides a layered architecture for discovering, +downloading, and reading data from Brazilian public health databases +(DATASUS). It supports three remote data sources. + +Architecture Overview +--------------------- + +The package is organized into a hierarchy of abstract base classes and +concrete implementations:: + + pysus/api/ + ├── __init__.py # Package entry (re-exports PySUS) + ├── client.py # Main PySUS orchestrator + ├── extensions.py # File format handlers + ├── models.py # Abstract base classes + ├── types.py # Type aliases + ├── _impl/ + │ └── databases.py # High-level convenience functions + ├── ducklake/ # S3 DuckLake catalog client + ├── ftp/ # FTP client + └── dadosgov/ # dados.gov.br API client + +Quick Start +----------- + +The simplest way to use PySUS is via the high-level convenience +functions:: + + from pysus import sinan + + df = sinan(disease="dengue", year=2023) + +Or with the async API:: + + from pysus.api.client import PySUS + + async with PySUS() as pysus: + files = await pysus.query(dataset="sinan", group="DENG", year=2023) + for f in files: + await pysus.download(f) + + +Main Client +----------- + +.. automodule:: pysus.api.client + :members: + :undoc-members: + :show-inheritance: + +Types +----- + +.. automodule:: pysus.api.types + :members: + :undoc-members: + +File Format Handlers +-------------------- + +.. automodule:: pysus.api.extensions + :members: + :undoc-members: + :show-inheritance: + +Abstract Base Models +-------------------- + +.. automodule:: pysus.api.models + :members: + :undoc-members: + :show-inheritance: + +High-Level Data Functions +------------------------- + +.. automodule:: pysus.api._impl.databases + :members: + :undoc-members: + :show-inheritance: + +DuckLake Client +--------------- + +.. automodule:: pysus.api.ducklake.client + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: pysus.api.ducklake.catalog + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: pysus.api.ducklake.models + :members: + :undoc-members: + :show-inheritance: + +FTP Client +---------- + +.. automodule:: pysus.api.ftp.client + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: pysus.api.ftp.databases + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: pysus.api.ftp.models + :members: + :undoc-members: + :show-inheritance: + +DadosGov Client +--------------- + +.. automodule:: pysus.api.dadosgov.client + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: pysus.api.dadosgov.databases + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: pysus.api.dadosgov.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index 006a18d1..61426cec 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,13 +12,10 @@ # All configuration values have a default; values that are commented out # serve to show the default. -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) # -- General configuration ------------------------------------------------ @@ -33,9 +30,14 @@ "sphinx.ext.autodoc", "sphinx.ext.mathjax", "sphinx.ext.viewcode", + "sphinx.ext.intersphinx", "nbsphinx", ] +intersphinx_mapping = { + "sqlalchemy": ("https://docs.sqlalchemy.org/en/20/", None), +} + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/docs/source/databases/CNES.ipynb b/docs/source/databases/CNES.ipynb deleted file mode 100644 index 2a00576f..00000000 --- a/docs/source/databases/CNES.ipynb +++ /dev/null @@ -1,895 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "tFRs4aDTsdZb" - }, - "source": [ - "# CNES FTP Database\n", - "\n", - "Code to work with CNES (Cadastro Nacional de Estabelecimentos de Saúde) directories and files inside DATASUS FTP\n", - "\r\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from pysus import CNES\n", - "cnes = CNES()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'long_name': 'Cadastro Nacional de Estabelecimentos de Saúde',\n", - " 'source': 'https://cnes.datasus.gov.br/',\n", - " 'description': 'O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o sistema de informação oficial de cadastramento de informações de todos os estabelecimentos de saúde no país, independentemente de sua natureza jurídica ou de integrarem o Sistema Único de Saúde (SUS). Trata-se do cadastro oficial do Ministério da Saúde (MS) no tocante à realidade da capacidade instalada e mão-de-obra assistencial de saúde no Brasil em estabelecimentos de saúde públicos ou privados, com convênio SUS ou não.'}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cnes.metadata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "CNES FTP Database has lazy loading (also applied to Directories), therefore its content will require explict `load()` to be displayed:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-19 11:57:40.905 | INFO | pysus.ftp:content:440 - content is not loaded, use `load()` to load default paths\n" - ] - }, - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cnes.content" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNES - Cadastro Nacional de Estabelecimentos de Saúde" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cnes.load() # Loads default CNES content (from cnes.paths)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[/dissemin/publicos/CNES/200508_/Dados/DC,\n", - " /dissemin/publicos/CNES/200508_/Dados/EE,\n", - " /dissemin/publicos/CNES/200508_/Dados/EF,\n", - " /dissemin/publicos/CNES/200508_/Dados/EP,\n", - " /dissemin/publicos/CNES/200508_/Dados/EQ,\n", - " /dissemin/publicos/CNES/200508_/Dados/GM,\n", - " /dissemin/publicos/CNES/200508_/Dados/HB,\n", - " /dissemin/publicos/CNES/200508_/Dados/IN,\n", - " /dissemin/publicos/CNES/200508_/Dados/LT,\n", - " /dissemin/publicos/CNES/200508_/Dados/PF,\n", - " /dissemin/publicos/CNES/200508_/Dados/RC,\n", - " /dissemin/publicos/CNES/200508_/Dados/SR,\n", - " /dissemin/publicos/CNES/200508_/Dados/ST]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Default content for (loaded) CNES database \n", - "cnes.content" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## CNES Groups" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'DC': 'Dados Complementares',\n", - " 'EE': 'Estabelecimento de Ensino',\n", - " 'EF': 'Estabelecimento Filantrópico',\n", - " 'EP': 'Equipes',\n", - " 'EQ': 'Equipamentos',\n", - " 'GM': 'Gestão e Metas',\n", - " 'HB': 'Habilitação',\n", - " 'IN': 'Incentivos',\n", - " 'LT': 'Leitos',\n", - " 'PF': 'Profissional',\n", - " 'RC': 'Regra Contratual',\n", - " 'SR': 'Serviço Especializado',\n", - " 'ST': 'Estabelecimentos'}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cnes.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading groups\n", - "\n", - "To load specific groups into `cnes` content, it's possible to pass them in the `load()` method:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "17838 files loaded\n" - ] - } - ], - "source": [ - "cnes.load(\"DC\")\n", - "cnes.load([\"ST\", \"SR\"])\n", - "print(str(len(cnes.content)) + \" files loaded\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filtering files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### by group (required)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5940" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get files by group\n", - "len(cnes.get_files(\"SR\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11887" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Multiple groups\n", - "len(cnes.get_files([\"ST\", \"SR\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### by UF (state)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SR - São Paulo files: 220\n" - ] - } - ], - "source": [ - "# Get files by UF from group\n", - "print(\"SR - São Paulo files: \" + str(len( cnes.get_files(\"SR\", uf=\"SP\" ))))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "440" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Multiple UFs\n", - "len(cnes.get_files(\"SR\", uf=[\"SP\", \"RJ\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### by year" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "216" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(cnes.get_files(\"SR\", year=2023)) # or 23" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1243" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Multiple Years\n", - "len(cnes.get_files(\"SR\", year=[20, 21, 22, 23]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### by month" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1945" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(cnes.get_files(\"SR\", month=[1, 2, 3, 4])) # or single month" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Combining filters" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[SRSP2301.dbc, SRSP2302.dbc, SRSP2303.dbc]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "files = cnes.get_files(\"SR\", uf=\"SP\", year=2023, month=[1,2,3])\n", - "files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Describing Files" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group': 'Serviço Especializado',\n", - " 'last_update': '2023-02-17 07:31AM',\n", - " 'month': 'Janeiro',\n", - " 'name': 'SRSP2301.dbc',\n", - " 'size': '1.6 MB',\n", - " 'uf': 'São Paulo',\n", - " 'year': 2023}\n", - "{'group': 'Serviço Especializado',\n", - " 'last_update': '2023-03-14 02:34PM',\n", - " 'month': 'Fevereiro',\n", - " 'name': 'SRSP2302.dbc',\n", - " 'size': '1.6 MB',\n", - " 'uf': 'São Paulo',\n", - " 'year': 2023}\n", - "{'group': 'Serviço Especializado',\n", - " 'last_update': '2023-04-17 07:34AM',\n", - " 'month': 'Março',\n", - " 'name': 'SRSP2303.dbc',\n", - " 'size': '1.6 MB',\n", - " 'uf': 'São Paulo',\n", - " 'year': 2023}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "for file in files:\n", - " pprint(cnes.describe(file))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Downloading Files" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "SRSP2303.parquet: 100%|██████████| 81.4k/81.4k [00:05<00:00, 14.8kB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[/home/bida/pysus/SRSP2301.parquet,\n", - " /home/bida/pysus/SRSP2302.parquet,\n", - " /home/bida/pysus/SRSP2303.parquet]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquets = cnes.download(files)\n", - "parquets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading files\n", - "\n", - "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CNESCODUFMUNSERV_ESPCLASS_SRSRVUNICOREGSAUDEMICR_REGDISTRSANDISTRADMTPGESTAO...CNPJ_MANCARACTERAMB_NSUSAMB_SUSHOSP_NSUSHOSP_SUSCOMPETENCONTSRVUCNESTERCNAT_JUR
027909203514501450080206M...00000000000000110002023012062
127909203514501450090206M...00000000000000110002023012062
227909203514501450100206M...00000000000000110002023012062
327909203514501450130206M...00000000000000110002023012062
430254893514501110011110206M...461374850001601010020230111244
..................................................................
1448300008885354780141002M...46522942000130101002023011244
1448310008885354780144001144M...465229420001301010020230111244
1448320008885354780159001159M...465229420001301010020230111244
1448330008885354780159004M...46522942000130101002023011244
1448340008885354780159005M...46522942000130101002023011244
\n", - "

144835 rows × 32 columns

\n", - "
" - ], - "text/plain": [ - " CNES CODUFMUN SERV_ESP CLASS_SR SRVUNICO REGSAUDE MICR_REG \\\n", - "0 2790920 351450 145 008 0206 \n", - "1 2790920 351450 145 009 0206 \n", - "2 2790920 351450 145 010 0206 \n", - "3 2790920 351450 145 013 0206 \n", - "4 3025489 351450 111 001 111 0206 \n", - "... ... ... ... ... ... ... ... \n", - "144830 0008885 354780 141 002 \n", - "144831 0008885 354780 144 001 144 \n", - "144832 0008885 354780 159 001 159 \n", - "144833 0008885 354780 159 004 \n", - "144834 0008885 354780 159 005 \n", - "\n", - " DISTRSAN DISTRADM TPGESTAO ... CNPJ_MAN CARACTER AMB_NSUS \\\n", - "0 M ... 00000000000000 1 1 \n", - "1 M ... 00000000000000 1 1 \n", - "2 M ... 00000000000000 1 1 \n", - "3 M ... 00000000000000 1 1 \n", - "4 M ... 46137485000160 1 0 \n", - "... ... ... ... ... ... ... ... \n", - "144830 M ... 46522942000130 1 0 \n", - "144831 M ... 46522942000130 1 0 \n", - "144832 M ... 46522942000130 1 0 \n", - "144833 M ... 46522942000130 1 0 \n", - "144834 M ... 46522942000130 1 0 \n", - "\n", - " AMB_SUS HOSP_NSUS HOSP_SUS COMPETEN CONTSRVU CNESTERC NAT_JUR \n", - "0 0 0 0 202301 2062 \n", - "1 0 0 0 202301 2062 \n", - "2 0 0 0 202301 2062 \n", - "3 0 0 0 202301 2062 \n", - "4 1 0 0 202301 1 1244 \n", - "... ... ... ... ... ... ... ... \n", - "144830 1 0 0 202301 1244 \n", - "144831 1 0 0 202301 1 1244 \n", - "144832 1 0 0 202301 1 1244 \n", - "144833 1 0 0 202301 1244 \n", - "144834 1 0 0 202301 1244 \n", - "\n", - "[144835 rows x 32 columns]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquets[0].to_dataframe()" - ] - } - ], - "metadata": { - "colab": { - "name": "Getting CNES Data.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/source/databases/SIA.ipynb b/docs/source/databases/SIA.ipynb deleted file mode 100644 index d201580a..00000000 --- a/docs/source/databases/SIA.ipynb +++ /dev/null @@ -1,694 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a73920ee-3902-4270-a17d-b7907d8561d7", - "metadata": {}, - "source": [ - "# SIA FTP Database\n", - "##### Sistema de Informações Ambulatoriais" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "8d619b0d-300c-4bc2-a738-bf96d650311d", - "metadata": {}, - "outputs": [], - "source": [ - "from pysus import SIA\n", - "sia = SIA().load() # Loads the files from DATASUS" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "4d50b674-b5bd-4ec5-a812-15c680841879", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SIA - Sistema de Informações Ambulatoriais" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sia" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5a74172b-841e-4ba7-bcc6-41ec9a216423", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'long_name': 'Sistema de Informações Ambulatoriais',\n", - " 'source': 'http://sia.datasus.gov.br/principal/index.php',\n", - " 'description': 'O Sistema de Informação Ambulatorial (SIA) foi instituído pela Portaria GM/MS n.º 896 de 29 de junho de 1990. Originalmente, o SIA foi concebido a partir do projeto SICAPS (Sistema de Informação e Controle Ambulatorial da Previdência Social), em que os conceitos, os objetivos e as diretrizes criados para o desenvolvimento do SICAPS foram extremamente importantes e amplamente utilizados para o desenvolvimento do SIA, tais como: (i) o acompanhamento das programações físicas e orçamentárias; (ii) o acompanhamento das ações de saúde produzidas; (iii) a agilização do pagamento e controle orçamentário e financeiro; e (iv) a formação de banco de dados para contribuir com a construção do SUS.'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sia.metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "10e69ba8-baa1-4718-b53e-40af23084324", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'AB': 'APAC de Cirurgia Bariátrica',\n", - " 'ABO': 'APAC de Acompanhamento Pós Cirurgia Bariátrica',\n", - " 'ACF': 'APAC de Confecção de Fístula',\n", - " 'AD': 'APAC de Laudos Diversos',\n", - " 'AM': 'APAC de Medicamentos',\n", - " 'AMP': 'APAC de Acompanhamento Multiprofissional',\n", - " 'AN': 'APAC de Nefrologia',\n", - " 'AQ': 'APAC de Quimioterapia',\n", - " 'AR': 'APAC de Radioterapia',\n", - " 'ATD': 'APAC de Tratamento Dialítico',\n", - " 'BI': 'Boletim de Produção Ambulatorial individualizado',\n", - " 'IMPBO': '',\n", - " 'PA': 'Produção Ambulatorial',\n", - " 'PAM': '',\n", - " 'PAR': '',\n", - " 'PAS': '',\n", - " 'PS': 'RAAS Psicossocial',\n", - " 'SAD': 'RAAS de Atenção Domiciliar'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sia.groups" - ] - }, - { - "cell_type": "markdown", - "id": "ff0b298c-69cf-4884-b7c8-2936de2c3508", - "metadata": {}, - "source": [ - "### Getting specific files" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3910f87e-965b-4a5e-8fb2-9c9ad257d0f7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "14300 files\n" - ] - } - ], - "source": [ - "print(str(len(sia.get_files([\"PA\", \"BI\"]))) + \" files\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c3cd4449-55fd-418d-a6e0-53ff38cd9258", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[PASP0001.dbc,\n", - " PASP0002.dbc,\n", - " PASP0004.dbc,\n", - " PASP0005.dbc,\n", - " Pasp0006.dbc,\n", - " pasp0003.dbc,\n", - " pasp0007.dbc,\n", - " pasp0008.dbc,\n", - " pasp0009.dbc,\n", - " pasp0010.dbc,\n", - " pasp0011.dbc,\n", - " pasp0012.dbc]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sia.get_files(\"PA\", uf=\"SP\", year=2000)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3883f518-c21e-4334-b18e-7fd9127aa83f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[PASP0001.dbc, PASP0002.dbc]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "files = sia.get_files(\"PA\", uf=\"SP\", year=2000, month=[1,2])\n", - "files" - ] - }, - { - "cell_type": "markdown", - "id": "c065668a-0157-40cb-8662-0daf9db2937b", - "metadata": {}, - "source": [ - "### Describing files inside DATASUS server" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "017d4445-f172-4376-ba85-19c06f9d1de4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'PASP0001.dbc',\n", - " 'group': 'Produção Ambulatorial',\n", - " 'uf': 'São Paulo',\n", - " 'month': 'Janeiro',\n", - " 'year': 2000,\n", - " 'size': '7.2 MB',\n", - " 'last_update': '2013-10-24 04:18PM'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sia.describe(files[0])" - ] - }, - { - "cell_type": "markdown", - "id": "59039f48-9c9b-4807-81f6-24d2139a70b8", - "metadata": {}, - "source": [ - "### Downloading files" - ] - }, - { - "cell_type": "markdown", - "id": "dc671449-d4d3-4e32-9afd-68f0d844a104", - "metadata": {}, - "source": [ - "You can rather download multiple files or download them individually:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "8f53a375-266b-4290-8705-408d236fd6d1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "PASP0002.parquet: 100%|████████████| 447k/447k [00:20<00:00, 21.6kB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[/home/bida/pysus/PASP0001.parquet, /home/bida/pysus/PASP0002.parquet]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sia.download(files) # or specify a directory with `local_dir=` " - ] - }, - { - "cell_type": "markdown", - "id": "70e10d61-d2a7-49ed-a409-38175321df04", - "metadata": {}, - "source": [ - "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. " - ] - }, - { - "cell_type": "markdown", - "id": "886389c2-5c26-43c2-9820-0c3fa9d85021", - "metadata": {}, - "source": [ - "### Reading files\n", - "\n", - "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4d2ddb3e-0d04-4b3c-952f-12ddd658751b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████| 7.73M/7.73M [00:00<00:00, 4.84GB/s]\n" - ] - } - ], - "source": [ - "parquet = sia.download(files)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "0ef3fb2b-af49-4744-9692-410c4c4820b1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PA_CONDICPA_GESTAOPA_CODUNIPA_DATREFPA_CODPROPA_DOCORIGPA_CODESPPA_TIPPROPA_TIPATEPA_FXETAR...PA_DATPRPA_VALPROPA_VALAPRPA_UFMUNPA_MUNATPA_NUMAPAPA_CODOCOPA_CIDPRIPA_CIDSECPA_MORFOL
0EC3599990008672000010701223B270299...20000158.6558.6535503035503000000000000S01
1EC3599990008672000010701223B270299...20000115.3015.3035503035503000000000000S01
2EC3599990008672000010701223B270299...20000112.7512.7535503035503000000000000S01
3EC3599990008672000010701223B270299...20000112.7512.7535503035503000000000000S01
4EC3599990008672000010701223B270299...2000015.105.1035503035503000000000000S01
..................................................................
725999EA3599990161272000010102302B770000...2000013972.003972.0035503035503000000000000S01
726000EA3599990161272000010302206B300000...2000012.482.4835503035503000000000000S01
726001EA3599990161272000010302207B300000...2000016.396.3935503035503000000000000S01
726002EA3599990161272000010304103B300000...2000017.927.9235503035503000000000000S01
726003EA3599990161272000010401103B010027...2000012.002.0035503035503000000000000S01
\n", - "

726004 rows × 24 columns

\n", - "
" - ], - "text/plain": [ - " PA_CONDIC PA_GESTAO PA_CODUNI PA_DATREF PA_CODPRO PA_DOCORIG PA_CODESP \\\n", - "0 EC 359999 000867 200001 0701223 B 27 \n", - "1 EC 359999 000867 200001 0701223 B 27 \n", - "2 EC 359999 000867 200001 0701223 B 27 \n", - "3 EC 359999 000867 200001 0701223 B 27 \n", - "4 EC 359999 000867 200001 0701223 B 27 \n", - "... ... ... ... ... ... ... ... \n", - "725999 EA 359999 016127 200001 0102302 B 77 \n", - "726000 EA 359999 016127 200001 0302206 B 30 \n", - "726001 EA 359999 016127 200001 0302207 B 30 \n", - "726002 EA 359999 016127 200001 0304103 B 30 \n", - "726003 EA 359999 016127 200001 0401103 B 01 \n", - "\n", - " PA_TIPPRO PA_TIPATE PA_FXETAR ... PA_DATPR PA_VALPRO PA_VALAPR \\\n", - "0 02 99 ... 200001 58.65 58.65 \n", - "1 02 99 ... 200001 15.30 15.30 \n", - "2 02 99 ... 200001 12.75 12.75 \n", - "3 02 99 ... 200001 12.75 12.75 \n", - "4 02 99 ... 200001 5.10 5.10 \n", - "... ... ... ... ... ... ... ... \n", - "725999 00 00 ... 200001 3972.00 3972.00 \n", - "726000 00 00 ... 200001 2.48 2.48 \n", - "726001 00 00 ... 200001 6.39 6.39 \n", - "726002 00 00 ... 200001 7.92 7.92 \n", - "726003 00 27 ... 200001 2.00 2.00 \n", - "\n", - " PA_UFMUN PA_MUNAT PA_NUMAPA PA_CODOCO PA_CIDPRI PA_CIDSEC PA_MORFOL \n", - "0 355030 355030 00000000000 S01 \n", - "1 355030 355030 00000000000 S01 \n", - "2 355030 355030 00000000000 S01 \n", - "3 355030 355030 00000000000 S01 \n", - "4 355030 355030 00000000000 S01 \n", - "... ... ... ... ... ... ... ... \n", - "725999 355030 355030 00000000000 S01 \n", - "726000 355030 355030 00000000000 S01 \n", - "726001 355030 355030 00000000000 S01 \n", - "726002 355030 355030 00000000000 S01 \n", - "726003 355030 355030 00000000000 S01 \n", - "\n", - "[726004 rows x 24 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquet.to_dataframe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/databases/SIH.ipynb b/docs/source/databases/SIH.ipynb deleted file mode 100644 index c86615cb..00000000 --- a/docs/source/databases/SIH.ipynb +++ /dev/null @@ -1,685 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a73920ee-3902-4270-a17d-b7907d8561d7", - "metadata": {}, - "source": [ - "# SIH FTP Database\n", - "##### Sistema de Informações Hospitalares" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "8d619b0d-300c-4bc2-a738-bf96d650311d", - "metadata": {}, - "outputs": [], - "source": [ - "from pysus import SIH\n", - "sih = SIH().load() # Loads the files from DATASUS" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "4d50b674-b5bd-4ec5-a812-15c680841879", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SIH - Sistema de Informações Hospitalares" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sih" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5a74172b-841e-4ba7-bcc6-41ec9a216423", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'long_name': 'Sistema de Informações Hospitalares',\n", - " 'source': ('https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/',\n", - " 'https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/'),\n", - " 'description': 'A finalidade do AIH (Sistema SIHSUS) é a de transcrever todos os atendimentos que provenientes de internações hospitalares que foram financiadas pelo SUS, e após o processamento, gerarem relatórios para os gestores que lhes possibilitem fazer os pagamentos dos estabelecimentos de saúde. Além disso, o nível Federal recebe mensalmente uma base de dados de todas as internações autorizadas (aprovadas ou não para pagamento) para que possam ser repassados às Secretarias de Saúde os valores de Produção de Média e Alta complexidade além dos valores de CNRAC, FAEC e de Hospitais Universitários – em suas variadas formas de contrato de gestão.'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sih.metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "10e69ba8-baa1-4718-b53e-40af23084324", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'RD': 'AIH Reduzida',\n", - " 'RJ': 'AIH Rejeitada',\n", - " 'ER': 'AIH Rejeitada com erro',\n", - " 'SP': 'Serviços Profissionais',\n", - " 'CH': 'Cadastro Hospitalar',\n", - " 'CM': ''}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sih.groups" - ] - }, - { - "cell_type": "markdown", - "id": "ff0b298c-69cf-4884-b7c8-2936de2c3508", - "metadata": {}, - "source": [ - "### Getting specific files" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3910f87e-965b-4a5e-8fb2-9c9ad257d0f7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15649 files\n" - ] - } - ], - "source": [ - "print(str(len(sih.get_files([\"RD\", \"RJ\"]))) + \" files\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c3cd4449-55fd-418d-a6e0-53ff38cd9258", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[RDSP0001.dbc,\n", - " RDSP0002.dbc,\n", - " RDSP0003.dbc,\n", - " RDSP0004.dbc,\n", - " RDSP0005.dbc,\n", - " RDSP0006.dbc,\n", - " RDSP0007.dbc,\n", - " RDSP0008.dbc,\n", - " RDSP0009.dbc,\n", - " RDSP0010.dbc,\n", - " RDSP0011.dbc,\n", - " RDSP0012.dbc]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sih.get_files(\"RD\", uf=\"SP\", year=2000)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3883f518-c21e-4334-b18e-7fd9127aa83f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[RDSP0001.dbc, RDSP0002.dbc, RDSP0003.dbc]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "files = sih.get_files(\"RD\", uf=\"SP\", year=2000, month=[1,2,3])\n", - "files" - ] - }, - { - "cell_type": "markdown", - "id": "c065668a-0157-40cb-8662-0daf9db2937b", - "metadata": {}, - "source": [ - "### Describing files inside DATASUS server" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "017d4445-f172-4376-ba85-19c06f9d1de4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'RDSP0001.dbc',\n", - " 'group': 'AIH Reduzida',\n", - " 'uf': 'São Paulo',\n", - " 'month': 'Janeiro',\n", - " 'year': 2000,\n", - " 'size': '10.1 MB',\n", - " 'last_update': '2013-10-31 01:14PM'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sih.describe(files[0])" - ] - }, - { - "cell_type": "markdown", - "id": "59039f48-9c9b-4807-81f6-24d2139a70b8", - "metadata": {}, - "source": [ - "### Downloading files" - ] - }, - { - "cell_type": "markdown", - "id": "dc671449-d4d3-4e32-9afd-68f0d844a104", - "metadata": {}, - "source": [ - "You can rather download multiple files or download them individually:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "8f53a375-266b-4290-8705-408d236fd6d1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "RDSP0003.parquet: 100%|████████████| 340k/340k [00:12<00:00, 28.0kB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[/home/bida/pysus/RDSP0001.parquet,\n", - " /home/bida/pysus/RDSP0002.parquet,\n", - " /home/bida/pysus/RDSP0003.parquet]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sih.download(files) # Specify a directory with `local_dir=`" - ] - }, - { - "cell_type": "markdown", - "id": "70e10d61-d2a7-49ed-a409-38175321df04", - "metadata": {}, - "source": [ - "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. " - ] - }, - { - "cell_type": "markdown", - "id": "886389c2-5c26-43c2-9820-0c3fa9d85021", - "metadata": {}, - "source": [ - "### Reading files\n", - "\n", - "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5882ddc1-9dfb-4181-baed-2adb09bf66e8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████| 8.46M/8.46M [00:00<00:00, 4.04GB/s]\n" - ] - } - ], - "source": [ - "parquet = sih.download(files)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "70db53ae-7105-48d8-9a42-868385dc3982", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
UF_ZIANO_CMPTMES_CMPTESPECCGC_HOSPN_AIHIDENTCEPMUNIC_RESNASC...CAR_INTTOT_PT_SPCPF_AUTHOMONIMONUM_FILHOSINSTRUCID_NOTIFCONTRACEP1CONTRACEP2GESTRISCO
0352000010246523171000287217938083010604009035344019631229...057190000000
1352000010246523171000287217938084110611000035344019620609...051760000000
2352000010246523171000287217938085210618408035344019781207...05360000000
3352000010246523171000287217938086310601002035344019710106...051760000000
4352000010246523171000287217938087410611201035344019710717...051580000000
..................................................................
193819352000010246523171000287217938078610603609035344019740804...057370000000
193820352000010246523171000287217938079710624012035344019670403...057370000000
193821352000010246523171000287217938080810614008035344019741001...057370000000
193822352000010246523171000287217938081910624009035344019721028...057190000000
193823352000010246523171000287217938082010626003035344019721122...057370000000
\n", - "

193824 rows × 60 columns

\n", - "
" - ], - "text/plain": [ - " UF_ZI ANO_CMPT MES_CMPT ESPEC CGC_HOSP N_AIH IDENT \\\n", - "0 35 2000 01 02 46523171000287 2179380830 1 \n", - "1 35 2000 01 02 46523171000287 2179380841 1 \n", - "2 35 2000 01 02 46523171000287 2179380852 1 \n", - "3 35 2000 01 02 46523171000287 2179380863 1 \n", - "4 35 2000 01 02 46523171000287 2179380874 1 \n", - "... ... ... ... ... ... ... ... \n", - "193819 35 2000 01 02 46523171000287 2179380786 1 \n", - "193820 35 2000 01 02 46523171000287 2179380797 1 \n", - "193821 35 2000 01 02 46523171000287 2179380808 1 \n", - "193822 35 2000 01 02 46523171000287 2179380819 1 \n", - "193823 35 2000 01 02 46523171000287 2179380820 1 \n", - "\n", - " CEP MUNIC_RES NASC ... CAR_INT TOT_PT_SP CPF_AUT HOMONIMO \\\n", - "0 06040090 353440 19631229 ... 05 719 \n", - "1 06110000 353440 19620609 ... 05 176 \n", - "2 06184080 353440 19781207 ... 05 36 \n", - "3 06010020 353440 19710106 ... 05 176 \n", - "4 06112010 353440 19710717 ... 05 158 \n", - "... ... ... ... ... ... ... ... ... \n", - "193819 06036090 353440 19740804 ... 05 737 \n", - "193820 06240120 353440 19670403 ... 05 737 \n", - "193821 06140080 353440 19741001 ... 05 737 \n", - "193822 06240090 353440 19721028 ... 05 719 \n", - "193823 06260030 353440 19721122 ... 05 737 \n", - "\n", - " NUM_FILHOS INSTRU CID_NOTIF CONTRACEP1 CONTRACEP2 GESTRISCO \n", - "0 0 0 00 00 0 \n", - "1 0 0 00 00 0 \n", - "2 0 0 00 00 0 \n", - "3 0 0 00 00 0 \n", - "4 0 0 00 00 0 \n", - "... ... ... ... ... ... ... \n", - "193819 0 0 00 00 0 \n", - "193820 0 0 00 00 0 \n", - "193821 0 0 00 00 0 \n", - "193822 0 0 00 00 0 \n", - "193823 0 0 00 00 0 \n", - "\n", - "[193824 rows x 60 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquet.to_dataframe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/databases/SIM.ipynb b/docs/source/databases/SIM.ipynb deleted file mode 100644 index 84c00eda..00000000 --- a/docs/source/databases/SIM.ipynb +++ /dev/null @@ -1,705 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# SIM FTP Database\n", - "##### Sistema de Informação sobre Mortalidade\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from pysus import SIM\n", - "sim = SIM().load() # Loads the files from DATASUS" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'long_name': 'Sistema de Informação sobre Mortalidade',\n", - " 'source': 'http://sim.saude.gov.br',\n", - " 'description': ''}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'CID10': 'DO', 'CID9': 'DOR'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.groups" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[/dissemin/publicos/SIM/CID10/DORES, /dissemin/publicos/SIM/CID9/DORES]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.paths" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For more information about CID9 and CID10, visit http://tabnet.saude.es.gov.br/cgi/tabnet/sim/sim96/obtdescr.htm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Getting specific files " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DORSP95.DBC]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.get_files(\"CID9\", uf=\"SP\", year=1995)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[DORJ2019.dbc,\n", - " DORJ2020.dbc,\n", - " DORJ2021.dbc,\n", - " DOSP2019.dbc,\n", - " DOSP2020.dbc,\n", - " DOSP2021.dbc]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.get_files(\"CID10\", uf=[\"SP\", \"RJ\"], year=[2019, 2020, 2021])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "files = sim.get_files([\"CID9\", \"CID10\"], uf=[\"SP\"], year=[1995, 2020])\n", - "sp_cid9, sp_cid10 = files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Describing a file inside DATASUS server" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'DORSP95.DBC',\n", - " 'uf': 'São Paulo',\n", - " 'year': 1995,\n", - " 'group': 'CID9',\n", - " 'size': '8.2 MB',\n", - " 'last_update': '2020-01-31 02:48PM'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.describe(sp_cid9)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'DOSP2020.dbc',\n", - " 'uf': 'São Paulo',\n", - " 'year': 2020,\n", - " 'group': 'CID10',\n", - " 'size': '28.7 MB',\n", - " 'last_update': '2022-03-31 04:19PM'}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.describe(sp_cid10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Downloading files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can rather download multiple files or download them individually:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DORSP95.parquet: 100%|█████████████| 434k/434k [00:12<00:00, 36.0kB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[/home/bida/pysus/DORSP95.parquet]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sim.download(sp_cid9) # Downloads to default directory" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "/home/bida/pysus/DORSP95.parquet" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquet = sp_cid9.download() # Or in a custom directory with `local_dir=`\n", - "parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading files\n", - "\n", - "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
contadorCARTORIOREGISTRODATAREGTIPOBITODATAOBITOESTCIVILSEXODATANASCIDADE...FONTINFOACIDTRABLOCACIDCRITICANUMEXPORTCRSOCORCRSRESRACACORETNIAUFINFORM
018000195100629510022119291003465...0035
118000295100629510023218980317497...0035
218000395100629510032219281002467...0035
318000495100629510033119110613484...0035
418000595100629510041119610914434...0035
..................................................................
22783217999695100429510014119380423457...0035
22783317999795100429510012119470130448...0035
22783417999895100429510013219160113479...0035
22783517999995100629510011119550901440...0035
22783618000095100629510011119700510425...0035
\n", - "

227837 rows × 50 columns

\n", - "
" - ], - "text/plain": [ - " contador CARTORIO REGISTRO DATAREG TIPOBITO DATAOBITO ESTCIVIL \\\n", - "0 180001 951006 2 951002 2 \n", - "1 180002 951006 2 951002 3 \n", - "2 180003 951006 2 951003 2 \n", - "3 180004 951006 2 951003 3 \n", - "4 180005 951006 2 951004 1 \n", - "... ... ... ... ... ... ... ... \n", - "227832 179996 951004 2 951001 4 \n", - "227833 179997 951004 2 951001 2 \n", - "227834 179998 951004 2 951001 3 \n", - "227835 179999 951006 2 951001 1 \n", - "227836 180000 951006 2 951001 1 \n", - "\n", - " SEXO DATANASC IDADE ... FONTINFO ACIDTRAB LOCACID CRITICA \\\n", - "0 1 19291003 465 ... 0 0 \n", - "1 2 18980317 497 ... 0 0 \n", - "2 2 19281002 467 ... 0 0 \n", - "3 1 19110613 484 ... 0 0 \n", - "4 1 19610914 434 ... 0 0 \n", - "... ... ... ... ... ... ... ... ... \n", - "227832 1 19380423 457 ... 0 0 \n", - "227833 1 19470130 448 ... 0 0 \n", - "227834 2 19160113 479 ... 0 0 \n", - "227835 1 19550901 440 ... 0 0 \n", - "227836 1 19700510 425 ... 0 0 \n", - "\n", - " NUMEXPORT CRSOCOR CRSRES RACACOR ETNIA UFINFORM \n", - "0 35 \n", - "1 35 \n", - "2 35 \n", - "3 35 \n", - "4 35 \n", - "... ... ... ... ... ... ... \n", - "227832 35 \n", - "227833 35 \n", - "227834 35 \n", - "227835 35 \n", - "227836 35 \n", - "\n", - "[227837 rows x 50 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquet.to_dataframe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - }, - "vscode": { - "interpreter": { - "hash": "2a96a5ccec8dfcba7d06b2e71f6eef3b5dac5716461bf5d73ea1bb7ee462cdaa" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/source/databases/SINAN.ipynb b/docs/source/databases/SINAN.ipynb deleted file mode 100644 index 48318721..00000000 --- a/docs/source/databases/SINAN.ipynb +++ /dev/null @@ -1,1222 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# SINAN FTP Database \n", - "##### Available diseases and years to download\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from pysus import SINAN\n", - "sinan = SINAN().load() # Loads the files from DATASUS" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'long_name': 'Doenças e Agravos de Notificação',\n", - " 'source': 'https://portalsinan.saude.gov.br/',\n", - " 'description': 'The Notifiable Diseases Information System - Sinan is primarilyfed by the notification and investigation of cases of diseases and conditions listed in the national list of compulsorily notifiable diseases (Consolidation Ordinance No. 4, September 28, 2017, Annex).However, states and municipalities are allowed to include other important health problems in their region, such as difilobotriasis in the municipality of São Paulo. Its effective use enables the dynamic diagnosis of the occurrence of an event in the population, providing evidence for causal explanations of compulsorily notifiable diseases and indicating risks to which people are exposed. This contributes to identifying the epidemiological reality of a specific geographical area. Its systematic, decentralized use contributes to the democratization of information, allowing all healthcare professionals to access and make it available to the community. Therefore, it is a relevant tool to assist in health planning, define intervention priorities, and evaluate the impact of interventions.'}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.metadata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Listing SINAN Codes & Diseases" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ACBI': 'Acidente de trabalho com material biológico',\n", - " 'ACGR': 'Acidente de trabalho',\n", - " 'ANIM': 'Acidente por Animais Peçonhentos',\n", - " 'ANTR': 'Atendimento Antirrabico',\n", - " 'BOTU': 'Botulismo',\n", - " 'CANC': 'Cancêr relacionado ao trabalho',\n", - " 'CHAG': 'Doença de Chagas Aguda',\n", - " 'CHIK': 'Febre de Chikungunya',\n", - " 'COLE': 'Cólera',\n", - " 'COQU': 'Coqueluche',\n", - " 'DENG': 'Dengue',\n", - " 'DERM': 'Dermatoses ocupacionais',\n", - " 'DIFT': 'Difteria',\n", - " 'ESQU': 'Esquistossomose',\n", - " 'EXAN': 'Doença exantemáticas',\n", - " 'FMAC': 'Febre Maculosa',\n", - " 'FTIF': 'Febre Tifóide',\n", - " 'HANS': 'Hanseníase',\n", - " 'HANT': 'Hantavirose',\n", - " 'HEPA': 'Hepatites Virais',\n", - " 'IEXO': 'Intoxicação Exógena',\n", - " 'INFL': 'Influenza Pandêmica',\n", - " 'LEIV': 'Leishmaniose Visceral',\n", - " 'LEPT': 'Leptospirose',\n", - " 'LERD': 'LER/Dort',\n", - " 'LTAN': 'Leishmaniose Tegumentar Americana',\n", - " 'MALA': 'Malária',\n", - " 'MENI': 'Meningite',\n", - " 'MENT': 'Transtornos mentais relacionados ao trabalho',\n", - " 'NTRA': 'Notificação de Tracoma',\n", - " 'PAIR': 'Perda auditiva por ruído relacionado ao trabalho',\n", - " 'PEST': 'Peste',\n", - " 'PFAN': 'Paralisia Flácida Aguda',\n", - " 'PNEU': 'Pneumoconioses realacionadas ao trabalho',\n", - " 'RAIV': 'Raiva',\n", - " 'SDTA': 'Surto Doenças Transmitidas por Alimentos',\n", - " 'SIFA': 'Sífilis Adquirida',\n", - " 'SIFC': 'Sífilis Congênita',\n", - " 'SIFG': 'Sífilis em Gestante',\n", - " 'SRC': 'Síndrome da Rubéola Congênia',\n", - " 'TETA': 'Tétano Acidental',\n", - " 'TETN': 'Tétano Neonatal',\n", - " 'TOXC': 'Toxoplasmose Congênita',\n", - " 'TOXG': 'Toxoplasmose Gestacional',\n", - " 'TRAC': 'Inquérito de Tracoma',\n", - " 'TUBE': 'Tuberculose',\n", - " 'VARC': 'Varicela',\n", - " 'VIOL': 'Violência doméstica, sexual e/ou outras violências',\n", - " 'ZIKA': 'Zika Vírus'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.diseases" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Getting specific files " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[CHIKBR15.dbc,\n", - " CHIKBR16.dbc,\n", - " CHIKBR17.dbc,\n", - " CHIKBR18.dbc,\n", - " CHIKBR19.dbc,\n", - " CHIKBR20.dbc,\n", - " CHIKBR21.dbc,\n", - " CHIKBR22.dbc,\n", - " CHIKBR23.dbc,\n", - " ZIKABR16.dbc,\n", - " ZIKABR17.dbc,\n", - " ZIKABR18.dbc,\n", - " ZIKABR19.dbc,\n", - " ZIKABR20.dbc,\n", - " ZIKABR21.dbc,\n", - " ZIKABR22.dbc,\n", - " ZIKABR23.dbc]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.get_files(dis_code=[\"ZIKA\", \"CHIK\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[BOTUBR23.dbc,\n", - " CHIKBR23.dbc,\n", - " DENGBR23.dbc,\n", - " ESQUBR23.dbc,\n", - " FTIFBR23.dbc,\n", - " HANSBR23.dbc,\n", - " MENIBR23.dbc,\n", - " TOXCBR23.dbc,\n", - " TOXGBR23.dbc,\n", - " TUBEBR23.dbc,\n", - " VARCBR23.dbc,\n", - " ZIKABR23.dbc]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.get_files(year=2023)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DENGBR22.dbc, DENGBR23.dbc, ZIKABR22.dbc, ZIKABR23.dbc]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.get_files(dis_code=[\"DENG\", \"ZIKA\"], year=[2022, 2023])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "files = sinan.get_files(dis_code=\"BOTU\", year=[2007, 2008])\n", - "botu_2007, botu_2008 = files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Describing a file inside DATASUS server" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'BOTUBR07.dbc',\n", - " 'disease': 'Botulismo',\n", - " 'year': 2007,\n", - " 'size': '7.5 kB',\n", - " 'last_update': '2021-11-23 11:55AM'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.describe(botu_2007)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'BOTUBR08.dbc',\n", - " 'disease': 'Botulismo',\n", - " 'year': 2008,\n", - " 'size': '8.3 kB',\n", - " 'last_update': '2021-11-23 11:56AM'}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.describe(botu_2008)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Downloading files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can rather download multiple files or download them individually:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████| 8.35k/8.35k [00:00<00:00, 6.12MB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[/home/bida/pysus/BOTUBR07.parquet, /home/bida/pysus/BOTUBR08.parquet]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinan.download(files) # Downloads to default directory" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "/home/bida/pysus/BOTUBR07.parquet" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "botu_2007.download() # or specify a custom directory with `local_dir=`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading files\n", - "\n", - "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "parquet = botu_2007.download()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TP_NOTID_AGRAVODT_NOTIFICSEM_NOTNU_ANOSG_UF_NOTID_MUNICIPID_REGIONAID_UNIDADEDT_SIN_PRI...NU_PROTEIDT_LIQUORTP_SENSITITP_MOTORATP_REPETEDS_ALI1DS_ALI2DS_LOCAL1DS_LOCAL2DT_ENCERRA
02A0512007-01-12200702200741410690135623842992006-12-31...532007011222320070222
12A0512007-01-12200702200735355030133120774852007-01-10...TORTA DE PALMITO/ FRANGO .20070323
22A0512007-01-03200701200752521180179123815322006-11-16...20070515
32A0512007-01-08200702200743431410161122469882007-01-03...3120070107SALAMEDOMICILIO
42A0512007-02-27200709200750500830197327572062007-02-27...20070816
52A0512007-02-16200707200735354340134820821872006-12-12...122
62A0512007-02-1520070720073333017022902272007-02-12...272007021420070724
72A0512007-02-28200709200722220800188840096222007-02-28...20070503
82A0512007-05-12200719200727270710153927216432007-05-09...20070716
92A0512007-06-11200724200735354850134920257522007-06-11...20071120
102A0512007-06-22200725200735352340134220237092007-06-12...402007061620070704
112A0512007-06-27200726200729292740138000040572007-06-25...39200706252220070730
122A0512007-07-3020073120075353001000104562007-07-26...832007072812220070730
132A0512007-07-10200728200752520870177923382622007-05-08...20070814
142A0512007-08-28200735200735354850134920797202007-08-17...SALGADO TORTA PAO COM QUEIJO CCASA DE MASSAS ROMANA EM SANTO20070903
152A0512007-08-24200734200735354850134920797202007-08-17...2007081920071025
162A0512007-08-17200733200735355030133120775742007-08-12...4020070813
172A0512007-08-20200734200751510340157824950152007-08-18...20070918
182A0512007-08-23200734200723230440151924821692007-08-19...MORTADELADOMICILIO20071023
192A0512007-09-16200738200731317010146122065952007-09-08...2007091520071017
202A0512007-09-06200736200735354340134820821872007-08-31...
212A0512007-09-04200736200735355220135320816952007-08-31...
222A0512007-10-08200741200735355220135320816952007-10-08...
232A0512007-10-02200740200731314330147321499902007-09-28...192007093020080601
242A0512007-11-06200745200735355220135320787322007-11-03...2007110520080108
252A0512007-11-01200744200752520870177925184062007-10-02...11120071112
\n", - "

26 rows × 140 columns

\n", - "
" - ], - "text/plain": [ - " TP_NOT ID_AGRAVO DT_NOTIFIC SEM_NOT NU_ANO SG_UF_NOT ID_MUNICIP \\\n", - "0 2 A051 2007-01-12 200702 2007 41 410690 \n", - "1 2 A051 2007-01-12 200702 2007 35 355030 \n", - "2 2 A051 2007-01-03 200701 2007 52 521180 \n", - "3 2 A051 2007-01-08 200702 2007 43 431410 \n", - "4 2 A051 2007-02-27 200709 2007 50 500830 \n", - "5 2 A051 2007-02-16 200707 2007 35 354340 \n", - "6 2 A051 2007-02-15 200707 2007 33 330170 \n", - "7 2 A051 2007-02-28 200709 2007 22 220800 \n", - "8 2 A051 2007-05-12 200719 2007 27 270710 \n", - "9 2 A051 2007-06-11 200724 2007 35 354850 \n", - "10 2 A051 2007-06-22 200725 2007 35 352340 \n", - "11 2 A051 2007-06-27 200726 2007 29 292740 \n", - "12 2 A051 2007-07-30 200731 2007 53 530010 \n", - "13 2 A051 2007-07-10 200728 2007 52 520870 \n", - "14 2 A051 2007-08-28 200735 2007 35 354850 \n", - "15 2 A051 2007-08-24 200734 2007 35 354850 \n", - "16 2 A051 2007-08-17 200733 2007 35 355030 \n", - "17 2 A051 2007-08-20 200734 2007 51 510340 \n", - "18 2 A051 2007-08-23 200734 2007 23 230440 \n", - "19 2 A051 2007-09-16 200738 2007 31 317010 \n", - "20 2 A051 2007-09-06 200736 2007 35 354340 \n", - "21 2 A051 2007-09-04 200736 2007 35 355220 \n", - "22 2 A051 2007-10-08 200741 2007 35 355220 \n", - "23 2 A051 2007-10-02 200740 2007 31 314330 \n", - "24 2 A051 2007-11-06 200745 2007 35 355220 \n", - "25 2 A051 2007-11-01 200744 2007 52 520870 \n", - "\n", - " ID_REGIONA ID_UNIDADE DT_SIN_PRI ... NU_PROTEI DT_LIQUOR TP_SENSITI \\\n", - "0 1356 2384299 2006-12-31 ... 53 20070112 2 \n", - "1 1331 2077485 2007-01-10 ... \n", - "2 1791 2381532 2006-11-16 ... \n", - "3 1611 2246988 2007-01-03 ... 31 20070107 \n", - "4 1973 2757206 2007-02-27 ... \n", - "5 1348 2082187 2006-12-12 ... 1 \n", - "6 2290227 2007-02-12 ... 27 20070214 \n", - "7 1888 4009622 2007-02-28 ... \n", - "8 1539 2721643 2007-05-09 ... \n", - "9 1349 2025752 2007-06-11 ... \n", - "10 1342 2023709 2007-06-12 ... 40 20070616 \n", - "11 1380 0004057 2007-06-25 ... 39 20070625 2 \n", - "12 0010456 2007-07-26 ... 83 20070728 1 \n", - "13 1779 2338262 2007-05-08 ... \n", - "14 1349 2079720 2007-08-17 ... \n", - "15 1349 2079720 2007-08-17 ... 20070819 \n", - "16 1331 2077574 2007-08-12 ... 40 20070813 \n", - "17 1578 2495015 2007-08-18 ... \n", - "18 1519 2482169 2007-08-19 ... \n", - "19 1461 2206595 2007-09-08 ... 20070915 \n", - "20 1348 2082187 2007-08-31 ... \n", - "21 1353 2081695 2007-08-31 ... \n", - "22 1353 2081695 2007-10-08 ... \n", - "23 1473 2149990 2007-09-28 ... 19 20070930 \n", - "24 1353 2078732 2007-11-03 ... 20071105 \n", - "25 1779 2518406 2007-10-02 ... 1 \n", - "\n", - " TP_MOTORA TP_REPETE DS_ALI1 DS_ALI2 \\\n", - "0 2 3 \n", - "1 TORTA DE PALMITO/ FRANGO . \n", - "2 \n", - "3 SALAME \n", - "4 \n", - "5 2 2 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 \n", - "11 2 \n", - "12 2 2 \n", - "13 \n", - "14 SALGADO TORTA PAO COM QUEIJO C \n", - "15 \n", - "16 \n", - "17 \n", - "18 MORTADELA \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "23 \n", - "24 \n", - "25 1 1 \n", - "\n", - " DS_LOCAL1 DS_LOCAL2 DT_ENCERRA \n", - "0 20070222 \n", - "1 20070323 \n", - "2 20070515 \n", - "3 DOMICILIO \n", - "4 20070816 \n", - "5 \n", - "6 20070724 \n", - "7 20070503 \n", - "8 20070716 \n", - "9 20071120 \n", - "10 20070704 \n", - "11 20070730 \n", - "12 20070730 \n", - "13 20070814 \n", - "14 CASA DE MASSAS ROMANA EM SANTO 20070903 \n", - "15 20071025 \n", - "16 \n", - "17 20070918 \n", - "18 DOMICILIO 20071023 \n", - "19 20071017 \n", - "20 \n", - "21 \n", - "22 \n", - "23 20080601 \n", - "24 20080108 \n", - "25 20071112 \n", - "\n", - "[26 rows x 140 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquet.to_dataframe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - }, - "vscode": { - "interpreter": { - "hash": "2a96a5ccec8dfcba7d06b2e71f6eef3b5dac5716461bf5d73ea1bb7ee462cdaa" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/source/databases/SINASC.ipynb b/docs/source/databases/SINASC.ipynb deleted file mode 100644 index 097e9c40..00000000 --- a/docs/source/databases/SINASC.ipynb +++ /dev/null @@ -1,693 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# SINASC FTP Database\n", - "##### Sistema de Informações sobre Nascidos Vivos\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from pysus import SINASC\n", - "sinasc = SINASC().load() # Loads the files from DATASUS" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SINASC - Sistema de Informações sobre Nascidos Vivos" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinasc" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'long_name': 'Sistema de Informações sobre Nascidos Vivos',\n", - " 'source': 'http://sinasc.saude.gov.br/',\n", - " 'description': ''}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinasc.metadata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Listing codes & groups" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'DN': 'Declarações de Nascidos Vivos',\n", - " 'DNR': 'Dados dos Nascidos Vivos por UF de residência'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinasc.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Getting specific files " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DNSP1996.DBC,\n", - " DNSP1997.DBC,\n", - " DNSP1998.DBC,\n", - " DNSP1999.DBC,\n", - " DNSP2000.DBC,\n", - " DNSP2001.DBC,\n", - " DNSP2002.DBC,\n", - " DNSP2003.DBC,\n", - " DNSP2004.DBC,\n", - " DNSP2005.dbc,\n", - " DNSP2006.DBC,\n", - " DNSP2007.dbc,\n", - " DNSP2008.dbc,\n", - " DNSP2009.dbc,\n", - " DNSP2010.DBC,\n", - " DNSP2011.DBC,\n", - " DNSP2012.DBC,\n", - " DNSP2013.dbc,\n", - " DNSP2014.dbc,\n", - " DNSP2015.dbc,\n", - " DNSP2016.dbc,\n", - " DNSP2017.dbc,\n", - " DNSP2018.dbc,\n", - " DNSP2019.dbc,\n", - " DNSP2020.dbc,\n", - " DNSP2021.dbc]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinasc.get_files(\"DN\", uf=\"SP\") # or multiple [\"SP\", ...]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DNSP1999.DBC, DNSP2000.DBC]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "files = sinasc.get_files(\"DN\", uf=\"SP\", year=[1999, 2000])\n", - "files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Describing files inside DATASUS server" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'group': 'Declarações de Nascidos Vivos',\n", - " 'last_update': '2020-01-27 12:12PM',\n", - " 'name': 'DNSP1999.DBC',\n", - " 'size': '14.3 MB',\n", - " 'uf': 'São Paulo',\n", - " 'year': 1999}\n", - "{'group': 'Declarações de Nascidos Vivos',\n", - " 'last_update': '2020-01-27 12:12PM',\n", - " 'name': 'DNSP2000.DBC',\n", - " 'size': '14.3 MB',\n", - " 'uf': 'São Paulo',\n", - " 'year': 2000}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "for file in files:\n", - " pprint(sinasc.describe(file))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Downloading files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can rather download multiple files or download them individually:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DNSP2000.parquet: 100%|████████████| 523k/523k [00:15<00:00, 34.3kB/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[/home/bida/pysus/DNSP1999.parquet, /home/bida/pysus/DNSP2000.parquet]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sinasc.download(files) # Downloads to default directory or specify with `local_dir=`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading files\n", - "\n", - "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████| 14.3M/14.3M [00:00<00:00, 6.32GB/s]\n" - ] - } - ], - "source": [ - "parquet = sinasc.download(files)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
contadorLOCNASCCODMUNNASCIDADEMAEESTCIVMAEESCMAECODOCUPMAEQTDFILVIVOQTDFILMORTCODMUNRES...GRAVIDEZPARTOCONSULTASDTNASCSEXOAPGAR1APGAR5RACACORPESOCODANOMAL
09000113550308219203003550308...11820021999209103300
19000213550308239301003550308...11819031999209102300
29000313550308199201003550308...118090319991080943600
39000413550308169200003550308...118150319992091012600
49000513550308339200003550308...12817031999209092700
..................................................................
71442329999613522604189301003522604...114200519991081013400
71442429999713522604372201003522604...124290519991091013800
71442529999813522604219400003522604...114040619992091013500
71442629999913522604342400003522604...123040619992091013800
71442730000013522604379205003522604...118290519992101013100
\n", - "

714428 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " contador LOCNASC CODMUNNASC IDADEMAE ESTCIVMAE ESCMAE CODOCUPMAE \\\n", - "0 90001 1 3550308 21 9 2 \n", - "1 90002 1 3550308 23 9 3 \n", - "2 90003 1 3550308 19 9 2 \n", - "3 90004 1 3550308 16 9 2 \n", - "4 90005 1 3550308 33 9 2 \n", - "... ... ... ... ... ... ... ... \n", - "714423 299996 1 3522604 18 9 3 \n", - "714424 299997 1 3522604 37 2 2 \n", - "714425 299998 1 3522604 21 9 4 \n", - "714426 299999 1 3522604 34 2 4 \n", - "714427 300000 1 3522604 37 9 2 \n", - "\n", - " QTDFILVIVO QTDFILMORT CODMUNRES ... GRAVIDEZ PARTO CONSULTAS \\\n", - "0 03 00 3550308 ... 1 1 8 \n", - "1 01 00 3550308 ... 1 1 8 \n", - "2 01 00 3550308 ... 1 1 8 \n", - "3 00 00 3550308 ... 1 1 8 \n", - "4 00 00 3550308 ... 1 2 8 \n", - "... ... ... ... ... ... ... ... \n", - "714423 01 00 3522604 ... 1 1 4 \n", - "714424 01 00 3522604 ... 1 2 4 \n", - "714425 00 00 3522604 ... 1 1 4 \n", - "714426 00 00 3522604 ... 1 2 3 \n", - "714427 05 00 3522604 ... 1 1 8 \n", - "\n", - " DTNASC SEXO APGAR1 APGAR5 RACACOR PESO CODANOMAL \n", - "0 20021999 2 09 10 3300 \n", - "1 19031999 2 09 10 2300 \n", - "2 09031999 1 08 09 4 3600 \n", - "3 15031999 2 09 10 1 2600 \n", - "4 17031999 2 09 09 2700 \n", - "... ... ... ... ... ... ... ... \n", - "714423 20051999 1 08 10 1 3400 \n", - "714424 29051999 1 09 10 1 3800 \n", - "714425 04061999 2 09 10 1 3500 \n", - "714426 04061999 2 09 10 1 3800 \n", - "714427 29051999 2 10 10 1 3100 \n", - "\n", - "[714428 rows x 21 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "parquet.to_dataframe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" - }, - "vscode": { - "interpreter": { - "hash": "2a96a5ccec8dfcba7d06b2e71f6eef3b5dac5716461bf5d73ea1bb7ee462cdaa" - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/source/index.rst b/docs/source/index.rst index b1f9446d..9b036f3b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,7 +14,7 @@ Contents: :maxdepth: 2 Data Sources - Tutorials + API Reference Indices and tables diff --git a/pysus/api/__init__.py b/pysus/api/__init__.py index 1af15464..44fc4270 100644 --- a/pysus/api/__init__.py +++ b/pysus/api/__init__.py @@ -1 +1,7 @@ +"""PySUS public API for accessing Brazilian public health data. + +Provides clients for DuckLake, FTP, and DadosGov data sources, +file format handlers, and high-level convenience functions. +""" + from .client import PySUS as PySUSClient # noqa diff --git a/pysus/api/_impl/__init__.py b/pysus/api/_impl/__init__.py index e69de29b..f84b8bfe 100644 --- a/pysus/api/_impl/__init__.py +++ b/pysus/api/_impl/__init__.py @@ -0,0 +1,6 @@ +"""Implementation module for high-level data access functions. + +Exposes convenience functions (sinan, sim, sih, etc.) that +combine catalog querying, downloading, and Parquet reading +into a single call. +""" diff --git a/pysus/api/_impl/databases.py b/pysus/api/_impl/databases.py index fc8aecab..e36b2a11 100644 --- a/pysus/api/_impl/databases.py +++ b/pysus/api/_impl/databases.py @@ -1,3 +1,12 @@ +"""High-level convenience functions for fetching Brazilian health data. + +Each function wraps an asynchronous query/download pipeline and returns a +pandas DataFrame. The available datasets cover disease notification (SINAN), +vital statistics (SINASC, SIM), hospital admissions (SIH), ambulatory care +(SIA), immunisation (PNI), census data (IBGE), health facilities (CNES), +and hospitalisation records (CIHA). +""" + __all__ = [ "sinan", "sinasc", @@ -32,6 +41,8 @@ def _fetch_data( show_progress: bool = True, **kwargs, ) -> pd.DataFrame: + """Query, download, and concatenate Parquet files for a given dataset.""" + async def _fetch(): async with PySUS() as pysus: years = [year] if isinstance(year, int) else (year or [None]) @@ -77,10 +88,61 @@ async def _fetch(): def sinan( - disease: str, + disease: Literal[ + "ACBI", + "ACGR", + "ANIM", + "ANTR", + "BOTU", + "CANC", + "CHAG", + "CHIK", + "COLE", + "COQU", + "DENG", + "DERM", + "DIFT", + "ESQU", + "EXAN", + "FMAC", + "FTIF", + "HANS", + "HANT", + "HEPA", + "IEXO", + "INFL", + "LEIV", + "LEPT", + "LERD", + "LTAN", + "MALA", + "MENI", + "MENT", + "NTRA", + "PAIR", + "PEST", + "PFAN", + "PNEU", + "RAIV", + "SDTA", + "SIFA", + "SIFC", + "SIFG", + "SRC", + "TETA", + "TETN", + "TOXC", + "TOXG", + "TRAC", + "TUBE", + "VARC", + "VIOL", + "ZIKA", + ], year: int | list[int], **kwargs, ) -> pd.DataFrame: + """Fetch SINAN records for a given disease and year(s).""" return _fetch_data( dataset="sinan", group=disease.upper(), @@ -94,6 +156,7 @@ def sinasc( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch SINASC birth certificates for a given state, year(s), and group.""" return _fetch_data( dataset="sinasc", state=state.upper(), @@ -108,6 +171,7 @@ def sim( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch SIM mortality records for a given state, year(s), and group.""" return _fetch_data( dataset="sim", state=state.upper(), @@ -123,6 +187,7 @@ def sih( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch SIH hospital admissions for a state, year, month, and group.""" return _fetch_data( dataset="sih", state=state.upper(), @@ -139,6 +204,7 @@ def sia( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch SIA ambulatory care for a state, year, month, and group.""" return _fetch_data( dataset="sia", state=state.upper(), @@ -154,6 +220,7 @@ def pni( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch PNI immunisation records for a given state, year(s), and group.""" return _fetch_data( dataset="pni", state=state.upper(), @@ -167,6 +234,7 @@ def ibge( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch IBGE census data for given year(s) and optional group.""" return _fetch_data(dataset="ibge", group=group, year=year) @@ -177,6 +245,7 @@ def cnes( group: str | None = None, **kwargs, ) -> pd.DataFrame: + """Fetch CNES health facilities for a state, year, month, and group.""" return _fetch_data( dataset="cnes", state=state.upper(), @@ -193,6 +262,7 @@ def ciha( group: str | None = "CIHA", **kwargs, ) -> pd.DataFrame: + """Fetch CIHA hospitalisation records for state, year, month, and group.""" return _fetch_data( dataset="ciha", state=state.upper(), @@ -220,6 +290,8 @@ def list_files( month: int | list[int] | None = None, **kwargs, ) -> pd.DataFrame: + """List catalog files for a dataset, filtered by group/state/year/month.""" + async def _list(): async with PySUS() as pysus: ducklake = await pysus.get_ducklake() diff --git a/pysus/api/client.py b/pysus/api/client.py index 707817a3..99763a72 100644 --- a/pysus/api/client.py +++ b/pysus/api/client.py @@ -1,3 +1,9 @@ +"""Main orchestrator for the PySUS data pipeline. + +Manages file downloads, local state tracking, catalog attachment, +Parquet conversion, and query execution across multiple backends. +""" + import enum from collections.abc import Callable from datetime import datetime @@ -20,10 +26,12 @@ class Base(DeclarativeBase): - pass + """Base declarative class for SQLAlchemy ORM models.""" class DownloadStatus(enum.Enum): + """Download status values tracked for each local file.""" + PENDING = "pending" DOWNLOADING = "downloading" COMPLETED = "completed" @@ -32,6 +40,8 @@ class DownloadStatus(enum.Enum): class LocalFileState(Base): + """ORM model tracking the state of a downloaded local file.""" + __tablename__ = "local_file_state" path: Mapped[str] = mapped_column(String, primary_key=True) remote_path: Mapped[str] = mapped_column(String, nullable=False) @@ -54,7 +64,11 @@ class LocalFileState(Base): class PySUS: + """Central orchestrator for downloading and querying PySUS datasets.""" + def __init__(self, db_path: Path = CACHEPATH / "config.db"): + """Initialize PySUS with a DuckDB-backed SQLAlchemy engine.""" + db_path = Path(db_path) db_path.parent.mkdir(parents=True, exist_ok=True) @@ -68,6 +82,8 @@ def __init__(self, db_path: Path = CACHEPATH / "config.db"): self._dadosgov: DadosGovClient | None = None async def __aenter__(self): + """Set up DuckLake catalog and return self as async context manager.""" + self._ducklake = DuckLake() await self._ducklake._load_catalog() self._attach_client_catalog( @@ -77,6 +93,8 @@ async def __aenter__(self): return self async def __aexit__(self, exc_type, exc_val, exc_tb): + """Clean up all client connections and dispose of the engine.""" + if self._ducklake: await self._ducklake.close() if self._ftp: @@ -86,6 +104,8 @@ async def __aexit__(self, exc_type, exc_val, exc_tb): self.engine.dispose() async def get_ducklake(self) -> DuckLake: + """Return the DuckLake client, initializing it lazily if needed.""" + if self._ducklake is None: self._ducklake = DuckLake() await self._ducklake._load_catalog() @@ -96,12 +116,16 @@ async def get_ducklake(self) -> DuckLake: return self._ducklake async def get_dadosgov(self, access_token: str | None) -> DadosGovClient: + """Return the DadosGov client, connecting lazily if needed.""" + if self._dadosgov is None: self._dadosgov = DadosGovClient() await self._dadosgov.connect(token=access_token) return self._dadosgov async def get_ftp(self) -> FTPClient: + """Return the FTP client, connecting lazily if needed.""" + if self._ftp is None: self._ftp = FTPClient() await self._ftp.connect() @@ -111,6 +135,8 @@ async def get_local_file( self, file: BaseRemoteFile, ) -> BaseLocalFile | None: + """Look up a previously downloaded file by its remote path.""" + from pysus.api.extensions import ExtensionFactory client_name = file.client.name.lower() @@ -138,6 +164,8 @@ async def get_local_file( return await ExtensionFactory.instantiate(str(record.path)) def _attach_client_catalog(self, name: str, path: str): + """Attach an external DuckDB catalog to the engine if not attached.""" + abs_path = str(Path(path).absolute()) with self.engine.connect() as conn: q = "SELECT database_name FROM duckdb_databases() WHERE path = ?" @@ -149,6 +177,8 @@ def _attach_client_catalog(self, name: str, path: str): ) def _get_dest_path(self, file: BaseRemoteFile) -> Path: + """Build the local filesystem path for a given remote file.""" + client_name = file.client.name.lower() dataset_name = file.dataset.name.lower() @@ -174,6 +204,8 @@ async def _update_state( state: str | None = None, group: str | None = None, ): + """Create or update the LocalFileState record for a file.""" + with self.Session() as session: record = ( session.query(LocalFileState) @@ -204,6 +236,8 @@ async def download( token: str | None = None, callback: Callable | None = None, ) -> BaseLocalFile: + """Download a remote file and return a local file handle.""" + from pysus.api.extensions import ExtensionFactory existing_local = await self.get_local_file(file) @@ -264,6 +298,8 @@ async def download( ) from e async def _delete_record(self, path: str): + """Delete a LocalFileState record from the database.""" + with self.Session() as session: record = session.query(LocalFileState).filter_by(path=path).first() if record: @@ -276,6 +312,8 @@ async def download_to_parquet( token: str | None = None, callback: Callable[[int, int], None] | None = None, ) -> Parquet: + """Download a file and convert it to Parquet format.""" + local_file = await self.download( file=file, token=token, @@ -308,6 +346,8 @@ async def download_to_parquet( ) def get_local_hierarchy(self): + """Build a nested dict of cached files grouped by client and dataset.""" + with self.Session() as session: records = session.query(LocalFileState).all() @@ -338,6 +378,8 @@ def get_local_hierarchy(self): return hierarchy def get_completed_remote_paths(self) -> set[str]: + """Return remote paths for all successfully downloaded files.""" + with self.Session() as session: records = ( session.query(LocalFileState.remote_path) @@ -354,6 +396,8 @@ async def query( year: int | None = None, month: int | None = None, ): + """Query available datasets through the DuckLake catalog.""" + if self._ducklake is None: await self.get_ducklake() if self._ducklake is not None: @@ -371,6 +415,8 @@ def read_parquet( sql: str | None = None, mode: Literal["union", "intersection", "strict"] = "union", ) -> "DuckDBPyConnection": + """Read Parquet files with optional schema handling and SQL filter.""" + if not paths: raise ValueError("No paths provided") diff --git a/pysus/api/dadosgov/__init__.py b/pysus/api/dadosgov/__init__.py index 1efe5d57..30269ecb 100644 --- a/pysus/api/dadosgov/__init__.py +++ b/pysus/api/dadosgov/__init__.py @@ -1 +1,3 @@ +"""Client for the Brazilian Open Data Portal (dados.gov.br).""" + from .client import DadosGov as DadosGovClient # noqa diff --git a/pysus/api/dadosgov/client.py b/pysus/api/dadosgov/client.py index 0051d9e5..4e6cbde8 100644 --- a/pysus/api/dadosgov/client.py +++ b/pysus/api/dadosgov/client.py @@ -1,3 +1,5 @@ +"""HTTP client and data models for the dados.gov.br API.""" + from __future__ import annotations import pathlib @@ -15,6 +17,7 @@ def to_datetime(value: Any) -> datetime | None: + """Parse a Brazilian date string into a datetime object.""" if not value or not isinstance(value, str) or "Indisponível" in value: return None for fmt in ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y"): @@ -26,6 +29,7 @@ def to_datetime(value: Any) -> datetime | None: def to_bool(value: Any) -> bool: + """Parse a Brazilian Portuguese boolean value ("sim"/"não") into a bool.""" if isinstance(value, bool): return value return str(value).lower() in ("sim", "true", "1") @@ -36,27 +40,34 @@ def to_bool(value: Any) -> bool: class DadosGov(BaseRemoteClient): + """Client for the dados.gov.br open data portal API.""" + base_url: str = "https://dados.gov.br/dados/api" _token: str | None = PrivateAttr(default=None) _client: httpx.AsyncClient | None = PrivateAttr(default=None) def __init__(self, **data): + """Initialize the DadosGov client.""" super().__init__(**data) @property def name(self) -> str: + """Return the short client name.""" return "DadosGov" @property def long_name(self) -> str: + """Return the human-readable client name.""" return "Portal Brasileiro de Dados Abertos" @property def description(self) -> str: + """Return a description of the client.""" return "Interface de acesso ao API do Portal de Dados Abertos" async def connect(self, token: str | None = None) -> None: + """Connect to the dados.gov.br API with the given token.""" _token = token or self._token if not _token: @@ -84,19 +95,23 @@ async def connect(self, token: str | None = None) -> None: ) async def login(self, token: str | None = None, **kwargs) -> None: + """Authenticate with the API (delegates to connect).""" await self.connect(token=token) async def close(self) -> None: + """Close the underlying HTTP client.""" if self._client: await self._client.aclose() self._client = None async def datasets(self, **kwargs) -> list[Dataset]: + """Return a list of pre-configured health datasets.""" from .databases import AVAILABLE_DATABASES return [db_class(client=self) for db_class in AVAILABLE_DATABASES] async def list_datasets(self, **kwargs) -> list[ConjuntoDados]: + """Search and list available datasets from the portal.""" if self._client is None: raise ConnectionError( "Client not connected. Call login(token=...) first.", @@ -121,6 +136,7 @@ async def list_datasets(self, **kwargs) -> list[ConjuntoDados]: return [ConjuntoDados(**item, client=self) for item in data] async def get_dataset(self, id: str) -> ConjuntoDados: + """Fetch a single dataset by its ID.""" if self._client is None: raise ConnectionError( "Client not connected. Call login(token=...) first.", @@ -140,6 +156,7 @@ async def _download_file( output: pathlib.Path, callback: Callable[[int], None] | None = None, ) -> pathlib.Path: + """Download a remote file to a local path.""" if self._client is None: raise ConnectionError( "Client not connected. Call login(token=...) first.", @@ -156,6 +173,8 @@ async def _download_file( class Recurso(BaseModel): + """A single resource (file) within a dataset on dados.gov.br.""" + model_config = ConfigDict(populate_by_name=True) id: str @@ -168,6 +187,7 @@ class Recurso(BaseModel): file_name: str | None = Field(None, alias="nomeArquivo") async def get_size(self) -> int: + """Retrieve the file size from the remote server.""" async with httpx.AsyncClient(follow_redirects=True) as client: response = await client.head(self.url) @@ -182,6 +202,8 @@ async def get_size(self) -> int: class ConjuntoDados(BaseModel): + """A dataset group as returned by the dados.gov.br API.""" + model_config = ConfigDict(populate_by_name=True) client: BaseRemoteClient | None = None diff --git a/pysus/api/dadosgov/databases.py b/pysus/api/dadosgov/databases.py index f8f033bb..cc456593 100644 --- a/pysus/api/dadosgov/databases.py +++ b/pysus/api/dadosgov/databases.py @@ -1,9 +1,13 @@ +"""Pre-configured health database definitions accessible via dados.gov.br.""" + from typing import Any from .models import Dataset class CNES(Dataset): + """Cadastro Nacional de Estabelecimentos de Saúde (CNES).""" + ids: list[str] = [ "40a0d093-b12f-44a4-bdc7-bae8eb54dd04", "9455b341-b06e-408e-8e10-54b32b3d74ec", @@ -11,10 +15,12 @@ class CNES(Dataset): @property def name(self) -> str: + """Return the short name.""" return "CNES" @property def long_name(self) -> str: + """Return the human-readable name.""" return "Cadastro Nacional de Estabelecimentos de Saúde" @property @@ -26,10 +32,13 @@ def description(self) -> str: ) def formatter(self, filename: str) -> dict[str, Any]: + """Extract metadata from a filename (not yet implemented).""" raise NotImplementedError() class PNI(Dataset): + """Programa Nacional de Imunizações (PNI).""" + ids: list[str] = [ "2989d396-cb09-47e7-a3b8-a4b951ca0200", "543aa08a-46c4-44e8-802e-198daa30753d", @@ -42,10 +51,12 @@ class PNI(Dataset): @property def name(self) -> str: + """Return the short name.""" return "PNI" @property def long_name(self) -> str: + """Return the human-readable name.""" return "Programa Nacional de Imunizações" @property @@ -53,20 +64,25 @@ def description(self) -> str: return "O PNI monitora a cobertura vacinal e doses aplicadas no Brasil." def formatter(self, filename: str) -> dict[str, Any]: + """Extract metadata from a filename (not yet implemented).""" raise NotImplementedError() class SIA(Dataset): + """Sistema de Informações Ambulatoriais (SIA).""" + ids: list[str] = [ "9a335cb7-2b4f-4fce-8947-e8441b4a90af", ] @property def name(self) -> str: + """Return the short name.""" return "SIA" @property def long_name(self) -> str: + """Return the human-readable name.""" return "Sistema de Informações Ambulatoriais" @property @@ -76,10 +92,13 @@ def description(self) -> str: """ def formatter(self, filename: str) -> dict[str, Any]: + """Extract metadata from a filename (not yet implemented).""" raise NotImplementedError() class SINAN(Dataset): + """Sistema de Informação de Agravos de Notificação (SINAN).""" + ids: list[str] = [ "4d5e5d44-58a8-4d67-b8aa-4ef1e4b00a1c", "5699abe0-0510-4da8-b47d-209b3bb32b34", @@ -89,10 +108,12 @@ class SINAN(Dataset): @property def name(self) -> str: + """Return the short name.""" return "SINAN" @property def long_name(self) -> str: + """Return the human-readable name.""" return "Sistema de Informação de Agravos de Notificação" @property @@ -103,20 +124,25 @@ def description(self) -> str: """ def formatter(self, filename: str) -> dict[str, Any]: + """Extract metadata from a filename (not yet implemented).""" raise NotImplementedError() class SIM(Dataset): + """Sistema de Informação sobre Mortalidade (SIM).""" + ids: list[str] = [ "5f121f4d-47c6-428e-8ec6-e8ec56417172", ] @property def name(self) -> str: + """Return the short name.""" return "SIM" @property def long_name(self) -> str: + """Return the human-readable name.""" return "Sistema de Informação sobre Mortalidade" @property @@ -126,20 +152,25 @@ def description(self) -> str: """ def formatter(self, filename: str) -> dict[str, Any]: + """Extract metadata from a filename (not yet implemented).""" raise NotImplementedError() class SINASC(Dataset): + """Sistema de Informações sobre Nascidos Vivos (SINASC).""" + ids: list[str] = [ "441cc6bd-684a-4afd-a88b-ba4734c9e83e", ] @property def name(self) -> str: + """Return the short name.""" return "SINASC" @property def long_name(self) -> str: + """Return the human-readable name.""" return "Sistema de Informações sobre Nascidos Vivos" @property @@ -150,6 +181,7 @@ def description(self) -> str: """ def formatter(self, filename: str) -> dict[str, Any]: + """Extract metadata from a filename (not yet implemented).""" raise NotImplementedError() diff --git a/pysus/api/dadosgov/models.py b/pysus/api/dadosgov/models.py index 314ddaa4..bc763f4f 100644 --- a/pysus/api/dadosgov/models.py +++ b/pysus/api/dadosgov/models.py @@ -1,3 +1,5 @@ +"""Internal domain models for datasets, groups, and files from dados.gov.br.""" + import asyncio import pathlib from abc import abstractmethod @@ -19,11 +21,14 @@ class File(BaseRemoteFile): + """A downloadable file from a dados.gov.br dataset.""" + record: Recurso type: str = "File" _metadata: dict[str, Any] = PrivateAttr(default_factory=dict) def __init__(self, **data): + """Initialize the File with optional metadata.""" metadata = data.pop("_metadata", {}) super().__init__(**data) self._metadata = metadata @@ -33,6 +38,7 @@ def __repr__(self): return self.basename def model_post_init(self, __context: Any) -> None: + """Fetch remote metadata if size or modify date is missing.""" if not self.record.api_size or not self.record.last_modified: try: loop = asyncio.get_running_loop() @@ -44,16 +50,19 @@ def model_post_init(self, __context: Any) -> None: @property def extension(self) -> str: + """Return the file extension.""" if self.record.file_name: return pathlib.Path(self.record.file_name).suffix return pathlib.Path(self.record.url.split("/")[-1].split("?")[0]).suffix @property def size(self) -> int: + """Return the file size in bytes.""" return self.record.api_size or 0 @property def modify(self) -> datetime: + """Return the last modification date.""" m = self.record.last_modified if not m: raise ValueError("File requires a modify date") @@ -61,17 +70,21 @@ def modify(self) -> datetime: @property def year(self) -> int | None: + """Return the inferred year from metadata.""" return self._metadata.get("year") @property def month(self) -> int | None: + """Return the inferred month from metadata.""" return self._metadata.get("month") @property def state(self) -> State | None: + """Return the inferred state from metadata.""" return self._metadata.get("state") async def fetch_metadata(self) -> None: + """Fetch file size and last-modified from the remote server.""" try: async with httpx.AsyncClient( follow_redirects=True, @@ -102,11 +115,13 @@ async def _download( output: pathlib.Path | None = None, callback: Callable[[int], None] | None = None, ) -> pathlib.Path: + """Download the file to a local path.""" if not output: output = CACHEPATH / self.name return await self.client._download_file(self, output, callback=callback) async def fetch_size(self) -> int: + """Fetch the remote file size and update the local record.""" try: async with httpx.AsyncClient( follow_redirects=True, @@ -130,6 +145,8 @@ async def fetch_size(self) -> int: class Group(BaseRemoteGroup): + """A group of files within a dataset.""" + record: ConjuntoDados _formatter: ( Callable[ @@ -145,6 +162,7 @@ def __init__( dataset: BaseRemoteDataset, formatter: Callable | None = None, ): + """Initialize the Group with a dataset record and optional formatter.""" super().__init__(dataset=dataset) self.record = record self._formatter = formatter @@ -154,17 +172,21 @@ def __repr__(self): @property def name(self) -> str: + """Return the group slug name.""" return self.record.slug @property def long_name(self) -> str: + """Return the group title.""" return self.record.title @property def description(self) -> str: + """Return an empty description.""" return "" async def _fetch_files(self) -> list[BaseRemoteFile]: + """Build File objects from the underlying resources.""" files: list[BaseRemoteFile] = [] for recurso in self.record.resources: metadata = self._formatter(recurso, self) if self._formatter else {} @@ -179,6 +201,8 @@ async def _fetch_files(self) -> list[BaseRemoteFile]: class Dataset(BaseRemoteDataset): + """A health dataset available through dados.gov.br.""" + ids: list[str] = [] client: "DadosGov" @@ -187,9 +211,11 @@ def __repr__(self): @abstractmethod def formatter(self, filename: str) -> dict[str, Any]: + """Extract structured metadata from a filename.""" pass async def _fetch_content(self) -> list[Group]: + """Fetch all groups belonging to this dataset.""" items: list[Group] = [] client: "DadosGov" = self.client if self.ids: diff --git a/pysus/api/ducklake/__init__.py b/pysus/api/ducklake/__init__.py index 2d2f5d16..4ba051d0 100644 --- a/pysus/api/ducklake/__init__.py +++ b/pysus/api/ducklake/__init__.py @@ -1 +1,7 @@ +"""DuckLake subpackage for interacting with the PySUS S3 catalog. + +Provides a DuckDB-based client for querying and downloading +public health datasets stored in object storage. +""" + from .client import DuckLake as DuckLakeClient # noqa diff --git a/pysus/api/ducklake/catalog.py b/pysus/api/ducklake/catalog.py index b6d7d06d..cc83ba3f 100644 --- a/pysus/api/ducklake/catalog.py +++ b/pysus/api/ducklake/catalog.py @@ -1,3 +1,9 @@ +"""SQLAlchemy ORM models for the DuckLake catalog schema. + +Defines tables for datasets, groups, files, and columns stored +in the pysus schema of the local DuckDB catalog. +""" + import enum from datetime import datetime from typing import Optional @@ -18,6 +24,8 @@ class Base(DeclarativeBase): + """Base class for all DuckLake catalog ORM models.""" + pass @@ -41,16 +49,22 @@ class Base(DeclarativeBase): class CatalogTable(Base): + """Abstract base for catalog tables sharing the pysus schema.""" + __abstract__ = True __table_args__: tuple = ({"schema": "pysus"},) class Origin(enum.Enum): + """Origin type for a dataset: FTP or API.""" + FTP = "ftp" API = "api" class CatalogDataset(CatalogTable): + """ORM model for the datasets table, representing a dataset collection.""" + __tablename__ = "datasets" id = Column( @@ -81,6 +95,8 @@ class CatalogDataset(CatalogTable): class ColumnDefinition(CatalogTable): + """ORM model for dataset column metadata (name, type, description).""" + __tablename__ = "dataset_columns" id = Column( @@ -113,6 +129,8 @@ class ColumnDefinition(CatalogTable): class DatasetGroup(CatalogTable): + """ORM model for dataset groups, grouping related files within a dataset.""" + __tablename__ = "dataset_groups" id = Column( @@ -144,6 +162,8 @@ class DatasetGroup(CatalogTable): class CatalogFile(CatalogTable): + """ORM model for the files table, representing individual data files.""" + __tablename__ = "files" id: Mapped[int] = mapped_column( diff --git a/pysus/api/ducklake/client.py b/pysus/api/ducklake/client.py index 8dbb2523..469da9b2 100644 --- a/pysus/api/ducklake/client.py +++ b/pysus/api/ducklake/client.py @@ -1,3 +1,9 @@ +"""High-level client for DuckLake S3-based dataset catalog. + +Provides authentication, catalog synchronization, dataset querying, +and file download capabilities backed by a local DuckDB engine. +""" + from collections.abc import Callable from pathlib import Path from typing import Any @@ -18,6 +24,8 @@ class CatalogDatasetAdapter: + """Adapter wrapping a CatalogDataset ORM record for use by File objects.""" + def __init__(self, catalog_dataset: CatalogDataset, ducklake): self.name = catalog_dataset.name self.long_name = catalog_dataset.long_name or "" @@ -28,10 +36,13 @@ def __init__(self, catalog_dataset: CatalogDataset, ducklake): @property def content(self): + """Query the DuckLake client for files in this dataset.""" return self.ducklake.query(dataset=self.name.upper()) class DatasetGroupAdapter: + """Adapter wrapping a DatasetGroup ORM record for use by File objects.""" + def __init__(self, dataset_group: DatasetGroup, dataset): self.name = dataset_group.name self.long_name = dataset_group.long_name or "" @@ -43,21 +54,28 @@ def __str__(self): @property async def files(self): + """Return the list of files in this group.""" return [] async def _fetch_files(self): + """Fetch files from the remote source for this group.""" return [] async def search(self, **kwargs): + """Search for files within this group matching the given criteria.""" return [] class DuckLakeCredentials(BaseModel): + """Credentials for authenticating with the S3-compatible object storage.""" + access_key: SecretStr secret_key: SecretStr class DuckLake(BaseRemoteClient): + """Client for the DuckLake S3-based public health dataset catalog.""" + endpoint: str = "nbg1.your-objectstorage.com" region: str = "nbg1" bucket: str = "pysus" @@ -71,6 +89,7 @@ class DuckLake(BaseRemoteClient): _Session: Any = PrivateAttr(default=None) def __init__(self, engine=None, **data): + """Initialize the DuckLake client with an optional existing engine.""" super().__init__(**data) self._engine = engine self._cache_dir = Path(CACHEPATH) / "ducklake" @@ -79,29 +98,36 @@ def __init__(self, engine=None, **data): @property def name(self) -> str: + """Return the short name of this client.""" return "DuckLake" @property def long_name(self) -> str: + """Return the human-readable name of this client.""" return "PySUS s3 Client" @property def description(self) -> str: + """Return a description of this client.""" return "" # TODO: @property def catalog_path(self) -> Path: + """Return the local path to the downloaded catalog database.""" return self._catalog_local @property def _catalog_url(self) -> str: + """Return the remote URL of the catalog database file.""" return f"https://{self.endpoint}/{self.bucket}/{self._catalog_remote}" @property def _is_authenticated(self) -> bool: + """Return whether the client has credentials configured.""" return self.credentials is not None async def datasets(self, **kwargs) -> list[DuckDataset]: + """Return all datasets from the catalog as DuckDataset instances.""" if not self._Session: await self.connect() @@ -129,6 +155,7 @@ async def login( secret_key: str | None = None, **kwargs, ) -> None: + """Authenticate with S3 credentials and reconnect to the catalog.""" if access_key and secret_key: self.credentials = DuckLakeCredentials( access_key=SecretStr(access_key), @@ -145,6 +172,7 @@ async def login( ) def _setup_engine(self): + """Create and configure the DuckDB engine with S3 settings.""" engine = create_engine( f"duckdb:///{self._catalog_local}", poolclass=StaticPool, @@ -188,6 +216,7 @@ def _setup_engine(self): return engine async def connect(self, force: bool = False): + """Connect to the catalog, downloading it first if necessary.""" if self._engine and not force: if not self._Session: self._Session = sessionmaker(bind=self._engine) @@ -198,6 +227,7 @@ async def connect(self, force: bool = False): self._Session = sessionmaker(bind=self._engine) async def close(self): + """Dispose the engine, then upload the catalog if authenticated.""" if self._engine: await to_thread.run_sync(self._engine.dispose) @@ -215,6 +245,7 @@ async def _download_file( output: Path, callback: Callable[[int], None] | None = None, ) -> Path: + """Download a single file from object storage to the local path.""" if not isinstance(file, File): raise ValueError("FTP File was not properly instantiated") @@ -230,6 +261,7 @@ async def _download_file( return output async def _download_catalog(self, client: httpx.AsyncClient): + """Download the catalog database from remote storage with retries.""" max_retries = 5 for attempt in range(max_retries): @@ -249,6 +281,7 @@ async def _download_catalog(self, client: httpx.AsyncClient): raise e def _get_s3_client(self): + """Create and return a boto3 S3 client for the configured endpoint.""" if not self.credentials: raise ConnectionError("S3 Credentials not found") return boto3.client( @@ -263,6 +296,7 @@ def _get_s3_client(self): ) async def _load_catalog(self): + """Download remote catalog if the local copy is outdated or missing.""" async with httpx.AsyncClient(follow_redirects=True) as client: local_size = -1 if self._catalog_local.exists(): @@ -280,6 +314,7 @@ async def _load_catalog(self): await self._download_catalog(client) async def _upload_catalog(self): + """Upload the local catalog database to remote storage.""" if not self._is_authenticated: raise PermissionError( "Admin credentials required to upload catalog.", @@ -302,6 +337,7 @@ async def query( year: int | None = None, month: int | None = None, ) -> list[File]: + """Query catalog files by dataset, group, state, year, and/or month.""" if not self._Session: await self.connect() diff --git a/pysus/api/ducklake/models.py b/pysus/api/ducklake/models.py index bb4e8da6..527f2caa 100644 --- a/pysus/api/ducklake/models.py +++ b/pysus/api/ducklake/models.py @@ -1,3 +1,9 @@ +"""Application-level models for DuckLake remote resources. + +Wraps catalog ORM records into BaseRemoteFile, BaseRemoteDataset, +and BaseRemoteGroup interfaces used by the rest of PySUS. +""" + import hashlib from collections.abc import Callable from datetime import datetime @@ -18,6 +24,8 @@ class File(BaseRemoteFile): + """A remote file in DuckLake catalog with download and verification.""" + record: CatalogFile = Field(exclude=True) type: str = "remote" dataset: Any @@ -25,26 +33,32 @@ class File(BaseRemoteFile): @property def basename(self) -> str: + """Return the file name without directory components.""" return self.path.name @property def extension(self) -> str: + """Return the file extension including the leading dot.""" return self.path.suffix @property def size(self) -> int: + """Return the file size in bytes.""" return self.record.size @property def modify(self) -> datetime: + """Return the last-modified timestamp.""" return self.record.modified @property def rows(self) -> int: + """Return the number of rows in the file.""" return self.record.rows @property def sha256(self) -> str | None: + """Return the SHA-256 hash of the file, if available.""" return self.record.sha256 async def _download( @@ -52,6 +66,7 @@ async def _download( output: Path | None = None, callback: Callable[[int], None] | None = None, ) -> Path: + """Download the file from object storage to the given output path.""" if not output: output = CACHEPATH / self.name @@ -62,6 +77,7 @@ async def _download( ) async def verify(self, path: Path) -> bool: + """Verify the file matches the recorded SHA-256 hash.""" if not self.sha256: return True @@ -77,6 +93,8 @@ def _calculate(): class DuckDataset(BaseRemoteDataset): + """A dataset from the DuckLake catalog, containing groups and files.""" + record: CatalogDataset = Field(exclude=True) client: BaseRemoteClient = Field(exclude=True) @@ -85,10 +103,12 @@ def __repr__(self) -> str: @property def name(self) -> str: + """Return the short name of the dataset.""" return self.record.name @property def long_name(self) -> str: + """Return the human-readable name of the dataset.""" return ( self.record.dataset_metadata.long_name if self.record.dataset_metadata @@ -97,6 +117,7 @@ def long_name(self) -> str: @property def description(self) -> str: + """Return the description of the dataset.""" return ( self.record.dataset_metadata.description if self.record.dataset_metadata @@ -104,6 +125,7 @@ def description(self) -> str: ) async def _fetch_content(self) -> list[Union["DuckGroup", File]]: + """Fetch groups and files belonging to this dataset.""" items: list[Union["DuckGroup", File]] = [] if self.record.groups: @@ -127,15 +149,19 @@ async def _fetch_content(self) -> list[Union["DuckGroup", File]]: class DuckGroup(BaseRemoteGroup): + """A group of related files within a DuckLake dataset.""" + record: DatasetGroup = Field(exclude=True) dataset: DuckDataset = Field(exclude=True) @property def name(self) -> str: + """Return the short name of the group.""" return self.record.name @property def long_name(self) -> str: + """Return the human-readable name of the group.""" return ( self.record.group_metadata.long_name if self.record.group_metadata @@ -144,11 +170,13 @@ def long_name(self) -> str: @property def description(self) -> str: + """Return the description of the group.""" if self.record.group_metadata: return self.record.group_metadata.description return "" async def _fetch_files(self) -> list[BaseRemoteFile]: + """Fetch the list of files belonging to this group.""" files: list[BaseRemoteFile] = [ File( path=f.path, diff --git a/pysus/api/extensions.py b/pysus/api/extensions.py index d4739e53..d1029049 100644 --- a/pysus/api/extensions.py +++ b/pysus/api/extensions.py @@ -1,3 +1,5 @@ +"""Map file extensions and MIME types to their handler classes.""" + import asyncio import csv import ctypes.util @@ -40,15 +42,20 @@ class File(BaseLocalFile): + """Represents a generic local file with no special handling.""" + type: FileType = Field("FILE") async def load(self) -> bytes: + """Read the entire file contents into memory as bytes.""" return await to_thread.run_sync(self.path.read_bytes) async def stream( self, chunk_size: int = 1024 * 1024, ) -> AsyncGenerator[bytes, None]: + """Yield the file contents in chunks of the given size.""" + def _read_sync(): with open(self.path, "rb") as f: while chunk := f.read(chunk_size): @@ -60,12 +67,16 @@ def _read_sync(): class Directory(BaseLocalFile): + """Represents a directory on the local filesystem.""" + type: FileType = Field("DIR") def __repr__(self) -> str: + """Return the directory name with a trailing slash.""" return f"{self.basename}/" async def load(self) -> list[BaseLocalFile]: + """Load all entries inside the directory as file objects.""" from pysus.api.extensions import ExtensionFactory if not self.path.exists(): @@ -79,6 +90,7 @@ async def stream( self, chunksize: int = 10000, ) -> AsyncGenerator[BaseLocalFile, None]: + """Yield each entry inside the directory as a file object.""" from pysus.api.extensions import ExtensionFactory for p in self.path.iterdir(): @@ -86,17 +98,21 @@ async def stream( class CSV(BaseTabularFile): + """Represents a CSV file with automatic encoding and separator detection.""" + type: FileType = Field("CSV") _encoding: str | None = PrivateAttr(default=None) _sep: str | None = PrivateAttr(default=None) @property def columns(self) -> list[str]: + """Return the column names from the CSV header row.""" df = pd.read_csv(self.path, sep=",", nrows=0) return df.columns.tolist() @property def rows(self) -> int: + """Return the number of data rows in the file.""" count = 0 with open(self.path, "rb") as f: for _ in f: @@ -104,6 +120,7 @@ def rows(self) -> int: return max(0, count - 1) async def _get_encoding(self) -> str: + """Detect and cache the file's character encoding.""" if self._encoding is None: def detect(): @@ -115,6 +132,7 @@ def detect(): return self._encoding async def _get_sep(self) -> str: + """Sniff and cache the CSV delimiter.""" if self._sep is None: encoding = await self._get_encoding() @@ -131,6 +149,7 @@ def sniff(): return self._sep async def load(self) -> pd.DataFrame: + """Read the entire CSV into a DataFrame.""" encoding = await self._get_encoding() separator = await self._get_sep() @@ -145,6 +164,7 @@ async def stream( self, chunk_size: int = 10000, ) -> AsyncGenerator[pd.DataFrame, None]: + """Yield the CSV in chunks of the given number of rows.""" encoding = await self._get_encoding() separator = await self._get_sep() @@ -165,21 +185,28 @@ def _get_reader_sync(): class Parquet(BaseTabularFile): + """Represents a Parquet file with optional date and integer type parsing.""" + type: FileType = Field("PARQUET") @property def schema(self) -> pa.Schema: + """Return the Parquet schema as a PyArrow Schema object.""" return pq.read_schema(self.path) @property def columns(self) -> list[str]: + """Return the column names from the Parquet schema.""" return pq.read_schema(self.path).names @property def rows(self) -> int: + """Return the number of rows from the Parquet metadata.""" return pq.read_metadata(self.path).num_rows async def load(self, parse: bool = True) -> pd.DataFrame: + """Read the entire Parquet file into a DataFrame.""" + def _load(): df = pd.read_parquet(self.path, engine="pyarrow") return self.parse_dftypes(df) if parse else df @@ -189,6 +216,7 @@ def _load(): async def stream( self, chunk_size: int = 10000, parse: bool = False ) -> AsyncGenerator[pd.DataFrame, None]: + """Yield the Parquet file in batches of the given size.""" parquet_file = await to_thread.run_sync(pq.ParquetFile, self.path) if parquet_file.metadata.num_row_groups == 0: @@ -203,6 +231,8 @@ async def stream( @staticmethod def parse_dftypes(df: pd.DataFrame) -> pd.DataFrame: + """Convert known date and integer columns to their proper types.""" + def str_to_int(string): if pd.isna(string): return string @@ -232,17 +262,22 @@ def str_to_date(string): class DBF(BaseTabularFile): + """Represents a dBASE (DBF) file.""" + type: FileType = Field("DBF") @property def columns(self) -> list[str]: + """Return the field names from the DBF file.""" return DBFReader(self.path, load=False).field_names @property def rows(self) -> int: + """Return the number of records in the DBF file.""" return len(DBFReader(self.path, load=False)) def decode_column(self, value): + """Decode a byte string value using cp1252 encoding.""" if isinstance(value, bytes): return ( value.decode(encoding="cp1252", errors="replace") @@ -254,6 +289,8 @@ def decode_column(self, value): return value async def load(self) -> pd.DataFrame: + """Read the entire DBF file into a DataFrame.""" + def _load(): dbf = DBFReader(self.path, encoding="cp1252", raw=True) df = pd.DataFrame(iter(dbf)) @@ -265,6 +302,8 @@ async def stream( self, chunk_size: int = 30000, ) -> AsyncGenerator[pd.DataFrame, None]: + """Yield the DBF records in chunks of the given size.""" + def _get_db(): return DBFReader(self.path, encoding="cp1252", raw=True) @@ -286,6 +325,7 @@ async def to_parquet( chunk_size: int = 30000, callback: Callable[[int, int], None] | None = None, ) -> "Parquet": + """Convert the DBF file to Parquet format.""" from pysus.api.extensions import ExtensionFactory out = ( @@ -349,21 +389,26 @@ async def _stream_to_single_file(): class DBC(BaseTabularFile): + """Represents a compressed DBC file, convertible to DBF then Parquet.""" + type: FileType = Field("DBC") @property def columns(self) -> list[str]: + """Not supported for DBC files. Convert to Parquet first.""" raise NotImplementedError( "DBC metadata cannot be read directly. Convert to Parquet first." ) @property def rows(self) -> int: + """Not supported for DBC files. Convert to Parquet first.""" raise NotImplementedError( "DBC metadata cannot be read directly. Convert to Parquet first." ) async def load(self) -> pd.DataFrame: + """Convert to Parquet and load the result as a DataFrame.""" parquet = await self.to_parquet() return await parquet.load() @@ -371,6 +416,7 @@ async def stream( self, chunk_size: int = 10000, ) -> AsyncGenerator[pd.DataFrame, None]: + """Convert to Parquet and stream its chunks.""" parquet = await self.to_parquet() async for chunk in parquet.stream(chunk_size=chunk_size): yield chunk @@ -381,6 +427,7 @@ async def to_parquet( chunk_size: int = 30000, callback: Callable[[int, int], None] | None = None, ) -> "Parquet": + """Decompress DBC to DBF, then convert to Parquet.""" from pysus.api.extensions import ExtensionFactory if output_path is None: @@ -414,10 +461,13 @@ async def to_parquet( class JSON(BaseTabularFile): + """Represents a JSON file with tabular data.""" + type: FileType = Field("JSON") @property def columns(self) -> list[str]: + """Return the column names from the JSON file.""" df = ( pd.read_json(self.path, nrows=0) if self.path.stat().st_size > 0 @@ -427,27 +477,35 @@ def columns(self) -> list[str]: @property def rows(self) -> int: + """Return the number of rows in the JSON file.""" return len(pd.read_json(self.path)) async def load(self) -> pd.DataFrame: + """Read the entire JSON file into a DataFrame.""" return await to_thread.run_sync(pd.read_json, self.path) async def stream( self, chunk_size: int = 10000, ) -> AsyncGenerator[pd.DataFrame, None]: + """Yield the entire JSON file as a single DataFrame.""" yield await self.load() class PDF(BaseLocalFile): + """Represents a PDF file.""" + type: FileType = Field("PDF") async def load(self) -> bytes: + """Read the entire PDF file contents into memory as bytes.""" return await to_thread.run_sync(self.path.read_bytes) async def stream( self, chunk_size: int | None = None ) -> AsyncGenerator[bytes, None]: + """Yield the PDF file contents in chunks of the given size.""" + def _read(): with open(self.path, "rb") as f: if chunk_size: @@ -462,12 +520,17 @@ def _read(): class Zip(BaseCompressedFile): + """Represents a ZIP archive file.""" + type: FileType = Field("ZIP") async def load(self) -> zipfile.ZipFile: + """Open and return the ZIP archive.""" return await to_thread.run_sync(zipfile.ZipFile, self.path) async def list_members(self) -> list[str]: + """Return the list of member names inside the archive.""" + def _list(): with zipfile.ZipFile(self.path) as z: return z.namelist() @@ -475,6 +538,8 @@ def _list(): return await to_thread.run_sync(_list) async def open_member(self, member_name: str) -> bytes: + """Read and return the contents of a named archive member.""" + def _read(): with zipfile.ZipFile(self.path) as z: return z.read(member_name) @@ -485,6 +550,7 @@ async def extract( self, target_dir: Path = CACHEPATH, ) -> list[BaseLocalFile]: + """Extract members to a target directory and return as file objects.""" from pysus.api.extensions import ExtensionFactory target_dir = Path(target_dir).expanduser().resolve() @@ -506,6 +572,7 @@ async def to_parquet( chunk_size: int = 30000, callback: Callable[[int, int], None] | None = None, ) -> "Parquet": + """Extract the archive and convert the first tabular file to Parquet.""" final_output = ( Path(output_path or self.path.with_suffix(".parquet")) .expanduser() @@ -535,6 +602,8 @@ async def to_parquet( await self._safe_cleanup(temp_dir) async def _safe_cleanup(self, directory: Path): + """Remove a temporary directory and its contents.""" + def _cleanup(): if not directory.exists(): return @@ -555,9 +624,13 @@ def _cleanup(): class GZip(BaseCompressedFile): + """Represents a GZip-compressed file.""" + type: FileType = Field("ZIP") async def load(self) -> bytes: + """Decompress and read the entire file contents into memory.""" + def _read(): with gzip.open(self.path, "rb") as f: return f.read() @@ -565,15 +638,19 @@ def _read(): return await to_thread.run_sync(_read) async def list_members(self) -> list[str]: + """Return a list containing the single decompressed file name.""" return [self.path.stem] async def open_member(self, member_name: str) -> bytes: + """Read and return the decompressed file contents.""" return await self.load() async def extract( self, target_dir: Path = CACHEPATH, ) -> list[BaseLocalFile]: + """Decompress the file to a target directory + and return it as a file object.""" from pysus.api.extensions import ExtensionFactory target_dir.mkdir(parents=True, exist_ok=True) @@ -594,12 +671,17 @@ def _decompress(): class Tar(BaseCompressedFile): + """Represents a Tar archive file.""" + type: FileType = Field("ZIP") async def load(self) -> tarfile.TarFile: + """Open and return the tar archive.""" return await to_thread.run_sync(tarfile.open, self.path) async def list_members(self) -> list[str]: + """Return the list of member names inside the archive.""" + def _list(): with tarfile.open(self.path) as t: return t.getnames() @@ -607,6 +689,8 @@ def _list(): return await to_thread.run_sync(_list) async def open_member(self, member_name: str) -> bytes: + """Read and return the contents of a named archive member.""" + def _read(): with tarfile.open(self.path) as t: f = t.extractfile(member_name) @@ -618,6 +702,7 @@ async def extract( self, target_dir: Path = CACHEPATH, ) -> list[BaseLocalFile]: + """Extract members to a target directory and return as file objects.""" from pysus.api.extensions import ExtensionFactory target_dir.mkdir(parents=True, exist_ok=True) @@ -633,6 +718,8 @@ def _extract(): class FTPNotImported(BaseTabularFile): + """Placeholder for DBC files when optional dependency is not installed.""" + path: Path = Field(default_factory=lambda: Path("...")) type: str | FileType = Field(default="remote") import_err: ClassVar[ @@ -645,35 +732,44 @@ class FTPNotImported(BaseTabularFile): @property def name(self) -> str: + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) @property def extension(self) -> str: + """Return the .dbc extension.""" return ".dbc" @property def size(self) -> int: + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) @property def modify(self) -> datetime: + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) @property def columns(self) -> list[str]: + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) @property def rows(self) -> int: + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) async def load(self) -> pd.DataFrame: + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) def stream( self, chunk_size: int = 10000, ) -> AsyncGenerator[pd.DataFrame, None]: + """Raise ImportError indicating the missing DBC dependency.""" + async def _internal_gen(): raise ImportError(self.import_err) yield pd.DataFrame() @@ -686,11 +782,13 @@ async def to_parquet( chunk_size: int = 10000, callback: Callable[[int, int], None] | None = None, ) -> Parquet: - + """Raise ImportError indicating the missing DBC dependency.""" raise ImportError(self.import_err) class ExtensionFactory: + """Factory that maps file extensions and MIME types to handler classes.""" + _mime: dict[str, type[BaseLocalFile]] = { "application/zip": Zip, "application/x-gzip": GZip, @@ -716,6 +814,7 @@ class ExtensionFactory: @classmethod async def _identify(cls, path: Path) -> type[BaseLocalFile] | None: + """Identify the file class by its MIME type.""" try: mime = await to_thread.run_sync( magic.from_file, @@ -728,6 +827,7 @@ async def _identify(cls, path: Path) -> type[BaseLocalFile] | None: @classmethod async def get_file_class(cls, path: Path) -> type[BaseLocalFile]: + """Return handler class for path, falling back to extension matching.""" mime_class = await cls._identify(path) if mime_class: return mime_class @@ -738,6 +838,7 @@ async def get_file_class(cls, path: Path) -> type[BaseLocalFile]: @classmethod async def instantiate(cls, path: str | Path) -> BaseLocalFile: + """Create and return the appropriate file handler for a given path.""" path = Path(path).expanduser().resolve() if await to_thread.run_sync(path.is_dir): return Directory(path=path, type="DIR") diff --git a/pysus/api/ftp/__init__.py b/pysus/api/ftp/__init__.py index d30edd44..f3138a4f 100644 --- a/pysus/api/ftp/__init__.py +++ b/pysus/api/ftp/__init__.py @@ -1 +1,3 @@ +"""FTP subpackage providing an async client for DATASUS datasets.""" + from .client import FTP as FTPClient # noqa diff --git a/pysus/api/ftp/client.py b/pysus/api/ftp/client.py index e7d348ae..0329038b 100644 --- a/pysus/api/ftp/client.py +++ b/pysus/api/ftp/client.py @@ -1,3 +1,5 @@ +"""Async FTP client wrapping the standard ftplib for DATASUS data access.""" + from __future__ import annotations import pathlib @@ -17,12 +19,16 @@ class FTPGroupInfo(TypedDict): + """Metadata describing a file group within a dataset.""" + name: str long_name: str | None description: str | None class FTPFileInfo(TypedDict): + """Parsed metadata for a file or directory entry from an FTP listing.""" + name: str size: int type: str @@ -34,20 +40,25 @@ class FTPFileInfo(TypedDict): class FTP(BaseRemoteClient): + """Async FTP client for navigating and downloading DATASUS data.""" + host: str = "ftp.datasus.gov.br" _ftp: FTPLib | None = PrivateAttr(default=None) @property def name(self) -> str: + """Return the short name of this client.""" return "FTP" @property def long_name(self) -> str: + """Return the human-readable name of this client.""" return "Pysus FTP Client" @property def description(self) -> str: + """Return a description of this client's purpose.""" return """ O cliente FTP do pysus foi desenvolvido para fornecer uma interface assíncrona e moderna para navegação e extração de dados diretamente @@ -58,9 +69,12 @@ def description(self) -> str: @property def ftp(self) -> FTPLib | None: + """Return the underlying ftplib.FTP, or None if not connected.""" return self._ftp async def connect(self) -> None: + """Establish the FTP connection to the remote host.""" + def _connect(): if self.ftp is None: self._ftp = FTPLib(self.host) @@ -69,9 +83,12 @@ def _connect(): await to_thread.run_sync(_connect) async def login(self, **kwargs) -> None: + """Authenticate and connect to the FTP server (alias for connect).""" await self.connect() async def close(self) -> None: + """Close the FTP connection and reset the internal client state.""" + def _close(): if self.ftp: try: @@ -84,6 +101,7 @@ def _close(): await to_thread.run_sync(_close) async def datasets(self, **kwargs) -> list[Dataset]: + """Return a list of all available dataset instances for this client.""" from .databases import AVAILABLE_DATABASES if self.ftp is None: @@ -100,6 +118,8 @@ async def _download_file( output: pathlib.Path, callback: Callable[..., None] | None = None, ) -> pathlib.Path: + """Download a remote file locally, optionally reporting progress.""" + async def _fetch(): try: self.ftp.voidcmd("NOOP") @@ -128,6 +148,7 @@ def _line_parser( file_line: str, formatter: Callable[[str], dict[str, Any]] | None = None, ) -> FTPFileInfo: + """Parse a line from a DATASUS FTP LIST response into FTPFileInfo.""" parts = file_line.strip().split() if len(parts) < 4: raise ValueError(f"Invalid FTP line: {file_line}") @@ -165,6 +186,8 @@ async def _list_directory( path: str, formatter: Callable[[str], dict[str, Any]] | None = None, ) -> list[FTPFileInfo]: + """List the contents of a remote directory and parse each entry.""" + def _list(): self.ftp.cwd(path) lines = [] diff --git a/pysus/api/ftp/databases.py b/pysus/api/ftp/databases.py index 5b042dad..c08b09de 100644 --- a/pysus/api/ftp/databases.py +++ b/pysus/api/ftp/databases.py @@ -1,3 +1,5 @@ +"""DATASUS FTP dataset definitions with filename parsers for each database.""" + from typing import Any from pysus.api.ftp.models import Dataset, Directory @@ -5,6 +7,8 @@ class CIHA(Dataset): + """Comunicação de Internação Hospitalar e Ambulatorial (CIHA).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/CIHA/201101_/Dados"), ] @@ -15,14 +19,17 @@ class CIHA(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "CIHA" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Comunicação de Internação Hospitalar e Ambulatorial" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return ( "A CIHA foi criada para ampliar o processo de planejamento, " "programação, controle, avaliação e regulação da assistência à " @@ -31,6 +38,7 @@ def description(self) -> str: ) def formatter(self, filename: str) -> dict[str, Any]: + """Parse a CIHA filename into group, state, year and month metadata.""" try: name = filename.split(".")[0].upper() group_code = name[:4] @@ -57,6 +65,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class CNES(Dataset): + """Cadastro Nacional de Estabelecimentos de Saúde (CNES).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/CNES/200508_/Dados"), ] @@ -78,14 +88,17 @@ class CNES(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "CNES" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Cadastro Nacional de Estabelecimentos de Saúde" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return ( "O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o " "sistema de informação oficial de cadastramento de informações " @@ -93,6 +106,7 @@ def description(self) -> str: ) def formatter(self, filename: str) -> dict[str, Any]: + """Parse a CNES filename into group, state, year and month metadata.""" try: name = filename.split(".")[0].upper() group_code = name[:2] @@ -118,6 +132,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class SINASC(Dataset): + """Sistema de Informações sobre Nascidos Vivos (SINASC).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/SINASC/NOV/DNRES"), Directory("/dissemin/publicos/SINASC/ANT/DNRES"), @@ -129,20 +145,24 @@ class SINASC(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "SINASC" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Sistema de Informações sobre Nascidos Vivos" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return """ O SINASC fornece subsídios para o diagnóstico de saúde e planejamento de políticas. """ def formatter(self, filename: str) -> dict[str, Any]: + """Parse a SINASC filename into group, state and year metadata.""" try: name = filename.split(".")[0].upper() year_short = name[-2:] @@ -162,6 +182,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class SIM(Dataset): + """Sistema de Informação sobre Mortalidade (SIM).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/SIM/CID10/DORES"), Directory("/dissemin/publicos/SIM/CID9/DORES"), @@ -173,17 +195,21 @@ class SIM(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "SIM" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Sistema de Informação sobre Mortalidade" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return "O SIM coleta dados sobre obitos no pais para analise epidemiologica." # noqa def formatter(self, filename: str) -> dict[str, Any]: + """Parse a SIM filename into group, state and year metadata.""" try: name = filename.split(".")[0].upper() if "CID9" in filename: @@ -204,6 +230,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class PNI(Dataset): + """Programa Nacional de Imunizações (PNI).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/PNI/DADOS"), ] @@ -214,17 +242,21 @@ class PNI(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "PNI" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Programa Nacional de Imunizações" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return "O SI-PNI monitora a cobertura vacinal e doses aplicadas." def formatter(self, filename: str) -> dict[str, Any]: + """Parse a PNI filename into group, state and year metadata.""" try: name = filename.split(".")[0].upper() group_code, state, year_short = name[:4], name[4:6], name[-2:] @@ -242,6 +274,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class IBGEDATASUS(Dataset): + """População Residente e Projeções (IBGE).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/IBGE/POP"), Directory("/dissemin/publicos/IBGE/censo"), @@ -262,17 +296,21 @@ class IBGEDATASUS(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "IBGE" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "População Residente e Projeções (IBGE)" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return "Informações sobre a população residente obtidas de Censos." def formatter(self, filename: str) -> dict[str, Any]: + """Parse an IBGE filename into group and year metadata.""" try: name = filename.split(".")[0].upper() year = name[-2:] @@ -295,6 +333,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class SIA(Dataset): + """Sistema de Informações Ambulatoriais — outpatient information system.""" + paths: list[Directory] = [ Directory("/dissemin/publicos/SIASUS/199407_200712/Dados"), Directory("/dissemin/publicos/SIASUS/200801_/Dados"), @@ -311,17 +351,21 @@ class SIA(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "SIA" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Sistema de Informações Ambulatoriais" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return "O SIA acompanha as ações de saúde produzidas." def formatter(self, filename: str) -> dict[str, Any]: + """Parse an SIA filename into group, state, year and month metadata.""" try: name = filename.split(".")[0].upper() digits = "".join([d for d in name if d.isdigit()]) @@ -350,6 +394,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class SIH(Dataset): + """Sistema de Informações Hospitalares (SIH).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/SIHSUS/199201_200712/Dados"), Directory("/dissemin/publicos/SIHSUS/200801_/Dados"), @@ -363,19 +409,23 @@ class SIH(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "SIH" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Sistema de Informações Hospitalares" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return """ O SIH processa as internações hospitalares financiadas pelo SUS. """ def formatter(self, filename: str) -> dict[str, Any]: + """Parse an SIH filename into group, state, year and month metadata.""" try: name = filename.split(".")[0].upper() group_code = name[:2] @@ -397,6 +447,8 @@ def formatter(self, filename: str) -> dict[str, Any]: class SINAN(Dataset): + """Sistema de Informação de Agravos de Notificação (SINAN).""" + paths: list[Directory] = [ Directory("/dissemin/publicos/SINAN/DADOS/FINAIS"), Directory("/dissemin/publicos/SINAN/DADOS/PRELIM"), @@ -456,17 +508,21 @@ class SINAN(Dataset): @property def name(self) -> str: + """Return the dataset short name.""" return "SINAN" @property def long_name(self) -> str: + """Return the dataset full name in Portuguese.""" return "Sistema de Informação de Agravos de Notificação" @property def description(self) -> str: + """Return a description of the dataset's purpose.""" return "O SINAN é alimentado pela notificação de doenças compulsórias." def formatter(self, filename: str) -> dict[str, Any]: + """Parse a SINAN filename into group and year metadata.""" try: name = filename.split(".")[0].upper() year_short = name[-2:] diff --git a/pysus/api/ftp/models.py b/pysus/api/ftp/models.py index 0f5ec5dd..4efb0349 100644 --- a/pysus/api/ftp/models.py +++ b/pysus/api/ftp/models.py @@ -1,3 +1,5 @@ +"""Data model classes for FTP directories, files, groups and datasets.""" + from __future__ import annotations import os @@ -21,9 +23,12 @@ class File(BaseRemoteFile): + """A single file on the DATASUS FTP server with parsed metadata.""" + _info: FTPFileInfo = PrivateAttr() def __init__(self, **data): + """Initialise the File with raw FTP metadata.""" info = data.pop("_info", None) if "path" not in data and info and "path" in info: data["path"] = info["path"] @@ -42,18 +47,22 @@ def __init__(self, **data): ) def __repr__(self) -> str: + """Return the file name as its string representation.""" return self.name @property def extension(self) -> str: + """Return the file extension (e.g. .dbc, .dbf).""" return Path(self.path).suffix @property def size(self) -> int: + """Return the file size in bytes.""" return self._info.get("size", 0) @property def modify(self) -> datetime: + """Return the last modification timestamp.""" m = self._info.get("modify") if not m: raise ValueError("File requires a modify date") @@ -61,14 +70,17 @@ def modify(self) -> datetime: @property def year(self) -> int | None: + """Return the data year extracted from the filename, if available.""" return self._info.get("year") @property def month(self) -> int | None: + """Return the data month extracted from the filename, if available.""" return self._info.get("month") @property def state(self) -> State | None: + """Return the state code extracted from the filename, if available.""" return self._info.get("state", None) async def _download( @@ -76,6 +88,7 @@ async def _download( output: Path | None = None, callback: Callable[[int], None] | None = None, ) -> Path: + """Download this file to a local path, optionally reporting progress.""" if output is None: cache_dir = Path(CACHEPATH) cache_dir.mkdir(parents=True, exist_ok=True) @@ -85,6 +98,8 @@ async def _download( class Directory: + """A remote FTP directory lazily loaded into files and subdirectories.""" + def __init__( self, path: str, @@ -93,6 +108,7 @@ def __init__( formatter: Callable | None = None, dataset: Dataset | None = None, ): + """Initialise the Directory with a remote path and optional context.""" self.path = os.path.normpath(path) self.parent = parent self.dataset = dataset or getattr(parent, "dataset", None) @@ -104,11 +120,13 @@ def __init__( @property async def content(self) -> list[Directory | File]: + """Return the directory contents, loading from FTP if not yet cached.""" if not self.loaded: await self.load() return self._content async def load(self) -> None: + """Fetch and parse the directory listing from the FTP server.""" if not isinstance(self.client, FTP): raise ValueError("no ftp client found") raw_infos = await self.client._list_directory( @@ -144,13 +162,17 @@ async def load(self) -> None: self.loaded = True def __str__(self) -> str: + """Return the normalised directory path.""" return self.path def __repr__(self) -> str: + """Return a debug representation of this directory.""" return f"" class Group(BaseRemoteGroup): + """A group of related files within a dataset (e.g. all files of a type).""" + path: str _name: str = PrivateAttr() _long_name: str = PrivateAttr() @@ -166,6 +188,7 @@ def __init__( description: str = "", **data: Any, ): + """Initialise the Group with metadata and a directory reference.""" data.update({"dataset": dataset, "path": path}) super().__init__(**data) @@ -182,51 +205,59 @@ def __init__( @property def name(self) -> str: + """Return the group short code (e.g. 'RD', 'PA').""" return self._name @property def long_name(self) -> str: + """Return the human-readable group name.""" return self._long_name @property def description(self) -> str: + """Return the group description.""" return self._description @property async def content(self) -> list[Directory | File]: + """Return the contents of the underlying directory.""" return await self._dir.content async def _fetch_files(self) -> list[BaseRemoteFile]: + """Return only the file entries from this group's directory.""" items = await self.content return [item for item in items if isinstance(item, BaseRemoteFile)] class Dataset(BaseRemoteDataset, ABC): + """Abstract base for a DATASUS dataset, providing file discovery via FTP.""" + paths: list[Directory] = [] group_definitions: dict[str, str] = {} @property @abstractmethod def name(self) -> str: - pass + """Return the dataset short name.""" @property @abstractmethod def long_name(self) -> str: - pass + """Return the dataset full name in Portuguese.""" @property @abstractmethod def description(self) -> str: - pass + """Return a description of the dataset's purpose.""" @abstractmethod def formatter(self, filename: str) -> dict[str, Any]: - pass + """Parse a filename into metadata (group, state, year, etc.).""" async def _fetch_content( self, ) -> Sequence[BaseRemoteGroup | BaseRemoteFile]: + """Walk the dataset's root directories and return groups and files.""" results: list[BaseRemoteGroup | BaseRemoteFile] = [] for root_dir in self.paths: @@ -258,4 +289,5 @@ async def _fetch_content( return results def __repr__(self) -> str: + """Return the dataset short name as its string representation.""" return self.name diff --git a/pysus/api/models.py b/pysus/api/models.py index aa7bbefe..e3771391 100644 --- a/pysus/api/models.py +++ b/pysus/api/models.py @@ -1,3 +1,12 @@ +"""Abstract model hierarchy for PySUS data access. + +Provides abstract base classes for local and remote file handling, organized +in a layered hierarchy: BaseFile -> BaseLocalFile -> BaseTabularFile / +BaseCompressedFile for local files, and BaseFile -> BaseRemoteFile for remote +files, alongside BaseRemoteObject -> BaseRemoteGroup / BaseRemoteDataset / +BaseRemoteClient for remote data catalogs. +""" + from __future__ import annotations import asyncio @@ -23,6 +32,11 @@ class BaseFile(BaseModel, ABC): + """Abstract base for a single file, local or remote. + + Subclasses must implement *name*, *extension*, *size*, and *modify*. + """ + model_config = ConfigDict( arbitrary_types_allowed=True, validate_assignment=True, @@ -34,10 +48,11 @@ class BaseFile(BaseModel, ABC): @property @abstractmethod def name(self) -> str: - pass + """Return the display name of the file.""" @property def basename(self) -> str: + """Return the file name from the path.""" return self.path.name def __str__(self) -> str: @@ -46,20 +61,25 @@ def __str__(self) -> str: @property @abstractmethod def extension(self) -> str: - pass + """Return the file extension string.""" @property @abstractmethod def size(self) -> int: - pass + """Return the file size in bytes.""" @property @abstractmethod def modify(self) -> datetime: - pass + """Return the last modification timestamp.""" class BaseLocalFile(BaseFile, ABC): + """Abstract base for a file stored on the local filesystem. + + Subclasses must implement *load* and *stream*. + """ + path: Path @property @@ -69,6 +89,13 @@ def name(self) -> str: async def get_hash( self, algorithm: str = "sha256", chunk_size: int = 1024 * 1024 ) -> str: + """Compute the file's hash digest. + + *algorithm* is the hash algorithm name (default "sha256"). + *chunk_size* is the read chunk size in bytes. + Return the hex digest string. + """ + def _compute_hash(): hash_obj = hashlib.new(algorithm) with open(self.path, "rb") as f: @@ -80,14 +107,14 @@ def _compute_hash(): @abstractmethod async def load(self) -> Any: - pass + """Load the entire file content into memory and return it.""" @abstractmethod def stream( self, chunk_size: int = 10000, ) -> AsyncGenerator[Any, None]: - pass + """Yield chunks of the file content as an async generator.""" @property def extension(self) -> str: @@ -103,26 +130,31 @@ def modify(self) -> datetime: class BaseTabularFile(BaseLocalFile, ABC): + """Abstract base for a local tabular file (e.g. CSV, Parquet). + + Subclasses must implement *columns*, *rows*, *load*, and *stream*. + """ + @property @abstractmethod def columns(self) -> list[str]: - pass + """Return the list of column names.""" @property @abstractmethod def rows(self) -> int: - pass + """Return the number of data rows.""" @abstractmethod async def load(self) -> pd.DataFrame: - pass + """Load the entire file into a pandas DataFrame.""" @abstractmethod def stream( self, chunk_size: int = 10000, ) -> AsyncGenerator[pd.DataFrame, None]: - pass + """Yield pandas DataFrames in chunks as an async generator.""" async def to_parquet( self, @@ -130,6 +162,13 @@ async def to_parquet( chunk_size: int = 10000, callback: Callable[[int, int], None] | None = None, ) -> Parquet: + """Convert the file to Parquet format. + + *output_path* is the destination path; defaults to the source path + with a .parquet extension. *chunk_size* controls the streaming chunk + size. *callback* receives (current_rows, total_rows) after each chunk. + Return the resulting Parquet wrapper object. + """ from pysus.api.extensions import ExtensionFactory, Parquet if output_path is None: @@ -187,25 +226,31 @@ async def to_parquet( class BaseCompressedFile(BaseLocalFile, ABC): + """Abstract base for a compressed archive file (e.g. .zip, .gz). + + Subclasses must implement *list_members*, *open_member*, and *extract*. + """ + @abstractmethod async def list_members(self) -> list[str]: - pass + """Return the list of member names inside the archive.""" @abstractmethod async def open_member(self, member_name: str) -> Any: - pass + """Open and return a single archive member by name.""" @abstractmethod async def extract( self, target_dir: Path = CACHEPATH, ) -> list[BaseLocalFile]: - pass + """Extract all members into *target_dir* and return the file objects.""" async def stream( self, chunk_size: int | None = None, ) -> AsyncGenerator[Any, None]: + """Yield each archive member as it is opened.""" members = await self.list_members() for member in members: yield await self.open_member(member) @@ -213,7 +258,10 @@ async def stream( class SearchableMixin: + """Mixin providing attribute-based filtering for remote objects.""" + def _matches(self, obj: Any, **kwargs) -> bool: + """Return True if all *kwargs* attributes match on *obj*.""" for key, value in kwargs.items(): obj_value = getattr(obj, key, None) if obj_value != value: @@ -222,6 +270,12 @@ def _matches(self, obj: Any, **kwargs) -> bool: class BaseRemoteFile(BaseFile, SearchableMixin, ABC): + """Abstract base for a file stored on a remote server. + + Subclasses must implement *_download*. *dataset* and *group* link back + to the containing objects. + """ + dataset: BaseRemoteDataset = Field(exclude=True) group: BaseRemoteGroup | None = Field(default=None, exclude=True) @@ -231,18 +285,22 @@ def name(self) -> str: @property def client(self) -> BaseRemoteClient: + """Return the remote client associated with this file.""" return self.dataset.client @property def year(self) -> int | None: + """Return the year associated with the file, or None.""" return None @property def month(self) -> int | None: + """Return the month associated with the file, or None.""" return None @property def state(self) -> State | None: + """Return the state associated with the file, or None.""" return None @abstractmethod @@ -251,13 +309,20 @@ async def _download( output: Path | None = None, callback: Callable[[int], None] | None = None, ) -> Path: - pass + """Download the file to *output* and return the local path. + + Subclasses implement the actual transfer logic. + """ async def download( self, output: str | Path | None = None, callback: Callable[[int], None] | None = None, ) -> BaseLocalFile: + """Download the remote file to a local cache or *output* path. + + Return the instantiated local file wrapper. + """ from pysus.api.extensions import ExtensionFactory if output is None: @@ -279,6 +344,11 @@ async def download( class BaseRemoteObject(BaseModel, ABC): + """Abstract base for a named remote entity with a description. + + Subclasses must implement *name*, *long_name*, and *description*. + """ + model_config = ConfigDict(arbitrary_types_allowed=True) def __str__(self) -> str: @@ -287,38 +357,49 @@ def __str__(self) -> str: @property @abstractmethod def name(self) -> str: - pass + """Return the short name of the remote entity.""" @property @abstractmethod def long_name(self) -> str: - pass + """Return the long / human-readable name.""" @property @abstractmethod def description(self) -> str: - pass + """Return a textual description of the entity.""" class BaseRemoteGroup(BaseRemoteObject, SearchableMixin, ABC): + """Abstract base for a named group of remote files within a dataset. + + Subclasses must implement *_fetch_files*. + """ + dataset: BaseRemoteDataset = Field(exclude=True) _files: list[BaseRemoteFile] | None = PrivateAttr(default=None) @property def parent(self) -> BaseRemoteDataset: + """Return the parent dataset.""" return self.dataset @abstractmethod async def _fetch_files(self) -> list[BaseRemoteFile]: - pass + """Fetch and return the list of files in this group.""" @property async def files(self) -> list[BaseRemoteFile]: + """Return all files in this group, fetching them on first access.""" if self._files is None: self._files = await self._fetch_files() return self._files async def search(self, **kwargs) -> list[BaseRemoteFile]: + """Filter files in this group by attribute *kwargs*. + + Return matching file objects. + """ all_files = await self.files if not kwargs: return all_files @@ -326,6 +407,11 @@ async def search(self, **kwargs) -> list[BaseRemoteFile]: class BaseRemoteDataset(BaseRemoteObject, SearchableMixin, ABC): + """Abstract base for a dataset containing groups and/or files. + + Subclasses must implement *_fetch_content*. + """ + client: BaseRemoteClient = Field(exclude=True) group_definitions: dict[str, str] = {} _content: Sequence[BaseRemoteGroup | BaseRemoteFile] | None = PrivateAttr( @@ -336,18 +422,23 @@ class BaseRemoteDataset(BaseRemoteObject, SearchableMixin, ABC): async def _fetch_content( self, ) -> Sequence[BaseRemoteGroup | BaseRemoteFile]: - pass + """Fetch and return the top-level content (groups and files).""" @property async def content( self, ) -> Sequence[BaseRemoteGroup | BaseRemoteFile]: + """Return the dataset content, fetching on first access.""" if self._content is None: self._content = await self._fetch_content() return self._content async def search(self, **kwargs) -> list[BaseRemoteFile]: + """Recursively search groups and files by attribute *kwargs*. + + Return matching file objects. + """ contents = await self.content matches = [] @@ -363,21 +454,27 @@ async def search(self, **kwargs) -> list[BaseRemoteFile]: class BaseRemoteClient(BaseRemoteObject, ABC): + """Abstract base for a remote API client (e.g. FTP, HTTP). + + Subclasses must implement *connect*, *close*, *login*, *datasets*, and + *_download_file*. + """ + @abstractmethod async def connect(self) -> None: - pass + """Establish a connection to the remote server.""" @abstractmethod async def close(self) -> None: - pass + """Close the connection to the remote server.""" @abstractmethod async def login(self, **kwargs) -> None: - pass + """Authenticate with the remote server using *kwargs* credentials.""" @abstractmethod async def datasets(self, **kwargs) -> list: - pass + """Return a list of available datasets matching *kwargs*.""" @abstractmethod async def _download_file( @@ -386,4 +483,4 @@ async def _download_file( output: Path, callback: Callable[[int], None] | None = None, ) -> Path: - pass + """Download a single *file* to *output* and return the local path.""" diff --git a/pysus/api/types.py b/pysus/api/types.py index 0f78d208..2ed8f95e 100644 --- a/pysus/api/types.py +++ b/pysus/api/types.py @@ -1,3 +1,13 @@ +"""Type aliases used across the PySUS API. + +FileType: + Discriminated union of supported local file types + (FILE, DIR, PARQUET, CSV, JSON, PDF, DBC, DBF, ZIP). + +State: + Brazilian state abbreviations (AC, AL, AP, ..., DF). +""" + from typing import Literal FileType = Literal[