diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 930a3d62..a48c0331 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -7,6 +7,10 @@ on:
pull_request:
branches: [ main ]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one queue per workflow + ref; avoids clashing with other workflows
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}  # supersede stale PR runs, never abort a run on main
+
jobs:
build:
runs-on: ubuntu-latest
@@ -39,3 +43,55 @@ jobs:
run: |
poetry config pypi-token.pypi ${PYPI_TOKEN}
make release
+
+  docs:
+    # Builds the Sphinx site on every run. Publishing lives in docs-deploy so
+    # pull requests never target the github-pages environment or need write scopes.
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}  # login shell, as setup-miniconda requires for env activation
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: conda-incubator/setup-miniconda@v3
+        with:
+          miniforge-version: latest
+          environment-file: conda/dev.yaml
+          channels: conda-forge,nodefaults
+          activate-environment: pysus
+          auto-update-conda: true
+          conda-solver: libmamba
+      - name: Install dependencies
+        run: |
+          pip install poetry poetry-plugin-export
+          poetry config virtualenvs.create false
+          poetry install --with docs --extras dbc
+      - name: Build docs
+        run: |
+          cd docs
+          make html
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/build/html
+
+  docs-deploy:
+    # Publishes the artifact uploaded by the docs job; skipped unless the ref is main.
+    needs: docs
+    if: github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Configure GitHub Pages
+        uses: actions/configure-pages@v5
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/README.md b/README.md
index 9391b203..dd403dfc 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,16 @@ df = sih(state="SP", year=2024, month=[1, 2, 3])
df = cnes(state="SP", year=2024, month=1)
```
+### Listing the files
+
+You can also list the files in a dataset to see which ones are available for download:
+
+```python
+from pysus import list_files
+
+list_files("SINAN")
+```
+
### Using the PySUS Client
```python
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 1dcfb82a..f2df05cd 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,3 +1,4 @@
nbsphinx
sphinx
sphinx-rtd-theme
+standard-imghdr
diff --git a/docs/source/api.rst b/docs/source/api.rst
new file mode 100644
index 00000000..b4a507e4
--- /dev/null
+++ b/docs/source/api.rst
@@ -0,0 +1,137 @@
+API Reference
+=============
+
+The ``pysus.api`` package provides a layered architecture for discovering,
+downloading, and reading data from Brazilian public health databases (DATASUS)
+across three remote sources: a DuckLake catalog on S3, FTP, and dados.gov.br.
+
+Architecture Overview
+---------------------
+
+The package is organized into a hierarchy of abstract base classes and
+concrete implementations::
+
+ pysus/api/
+ ├── __init__.py # Package entry (re-exports PySUS)
+ ├── client.py # Main PySUS orchestrator
+ ├── extensions.py # File format handlers
+ ├── models.py # Abstract base classes
+ ├── types.py # Type aliases
+ ├── _impl/
+ │ └── databases.py # High-level convenience functions
+ ├── ducklake/ # S3 DuckLake catalog client
+ ├── ftp/ # FTP client
+ └── dadosgov/ # dados.gov.br API client
+
+Quick Start
+-----------
+
+The simplest way to use PySUS is via the high-level convenience
+functions::
+
+ from pysus import sinan
+
+ df = sinan(disease="dengue", year=2023)
+
+Or with the async API (``async with`` must run inside a coroutine)::
+
+ from pysus.api.client import PySUS
+
+ async with PySUS() as pysus:
+ files = await pysus.query(dataset="sinan", group="DENG", year=2023)
+ for f in files:
+ await pysus.download(f)
+
+
+Main Client
+-----------
+
+.. automodule:: pysus.api.client
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Types
+-----
+
+.. automodule:: pysus.api.types
+ :members:
+ :undoc-members:
+
+File Format Handlers
+--------------------
+
+.. automodule:: pysus.api.extensions
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Abstract Base Models
+--------------------
+
+.. automodule:: pysus.api.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+High-Level Data Functions
+-------------------------
+
+.. automodule:: pysus.api._impl.databases
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+DuckLake Client
+---------------
+
+.. automodule:: pysus.api.ducklake.client
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: pysus.api.ducklake.catalog
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: pysus.api.ducklake.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+FTP Client
+----------
+
+.. automodule:: pysus.api.ftp.client
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: pysus.api.ftp.databases
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: pysus.api.ftp.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+DadosGov Client
+---------------
+
+.. automodule:: pysus.api.dadosgov.client
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: pysus.api.dadosgov.databases
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+.. automodule:: pysus.api.dadosgov.models
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 006a18d1..61426cec 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -12,13 +12,10 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath("../.."))  # two levels up (repo root from docs/source) so autodoc can import pysus
# -- General configuration ------------------------------------------------
@@ -33,9 +30,14 @@
"sphinx.ext.autodoc",
"sphinx.ext.mathjax",
"sphinx.ext.viewcode",
+ "sphinx.ext.intersphinx",
"nbsphinx",
]
+intersphinx_mapping = {  # external projects that cross-references may link into
+    "sqlalchemy": ("https://docs.sqlalchemy.org/en/20/", None),  # None: use the inventory published at that base URL
+}
+
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
diff --git a/docs/source/databases/CNES.ipynb b/docs/source/databases/CNES.ipynb
deleted file mode 100644
index 2a00576f..00000000
--- a/docs/source/databases/CNES.ipynb
+++ /dev/null
@@ -1,895 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "tFRs4aDTsdZb"
- },
- "source": [
- "# CNES FTP Database\n",
- "\n",
- "Code to work with CNES (Cadastro Nacional de Estabelecimentos de Saúde) directories and files inside DATASUS FTP\n",
- "\r\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pysus import CNES\n",
- "cnes = CNES()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'long_name': 'Cadastro Nacional de Estabelecimentos de Saúde',\n",
- " 'source': 'https://cnes.datasus.gov.br/',\n",
- " 'description': 'O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o sistema de informação oficial de cadastramento de informações de todos os estabelecimentos de saúde no país, independentemente de sua natureza jurídica ou de integrarem o Sistema Único de Saúde (SUS). Trata-se do cadastro oficial do Ministério da Saúde (MS) no tocante à realidade da capacidade instalada e mão-de-obra assistencial de saúde no Brasil em estabelecimentos de saúde públicos ou privados, com convênio SUS ou não.'}"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cnes.metadata"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "CNES FTP Database has lazy loading (also applied to Directories), therefore its content will require explict `load()` to be displayed:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2023-09-19 11:57:40.905 | INFO | pysus.ftp:content:440 - content is not loaded, use `load()` to load default paths\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[]"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cnes.content"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "CNES - Cadastro Nacional de Estabelecimentos de Saúde"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cnes.load() # Loads default CNES content (from cnes.paths)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[/dissemin/publicos/CNES/200508_/Dados/DC,\n",
- " /dissemin/publicos/CNES/200508_/Dados/EE,\n",
- " /dissemin/publicos/CNES/200508_/Dados/EF,\n",
- " /dissemin/publicos/CNES/200508_/Dados/EP,\n",
- " /dissemin/publicos/CNES/200508_/Dados/EQ,\n",
- " /dissemin/publicos/CNES/200508_/Dados/GM,\n",
- " /dissemin/publicos/CNES/200508_/Dados/HB,\n",
- " /dissemin/publicos/CNES/200508_/Dados/IN,\n",
- " /dissemin/publicos/CNES/200508_/Dados/LT,\n",
- " /dissemin/publicos/CNES/200508_/Dados/PF,\n",
- " /dissemin/publicos/CNES/200508_/Dados/RC,\n",
- " /dissemin/publicos/CNES/200508_/Dados/SR,\n",
- " /dissemin/publicos/CNES/200508_/Dados/ST]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Default content for (loaded) CNES database \n",
- "cnes.content"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## CNES Groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'DC': 'Dados Complementares',\n",
- " 'EE': 'Estabelecimento de Ensino',\n",
- " 'EF': 'Estabelecimento Filantrópico',\n",
- " 'EP': 'Equipes',\n",
- " 'EQ': 'Equipamentos',\n",
- " 'GM': 'Gestão e Metas',\n",
- " 'HB': 'Habilitação',\n",
- " 'IN': 'Incentivos',\n",
- " 'LT': 'Leitos',\n",
- " 'PF': 'Profissional',\n",
- " 'RC': 'Regra Contratual',\n",
- " 'SR': 'Serviço Especializado',\n",
- " 'ST': 'Estabelecimentos'}"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cnes.groups"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Loading groups\n",
- "\n",
- "To load specific groups into `cnes` content, it's possible to pass them in the `load()` method:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "17838 files loaded\n"
- ]
- }
- ],
- "source": [
- "cnes.load(\"DC\")\n",
- "cnes.load([\"ST\", \"SR\"])\n",
- "print(str(len(cnes.content)) + \" files loaded\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Filtering files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### by group (required)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "5940"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Get files by group\n",
- "len(cnes.get_files(\"SR\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "11887"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Multiple groups\n",
- "len(cnes.get_files([\"ST\", \"SR\"]))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### by UF (state)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "SR - São Paulo files: 220\n"
- ]
- }
- ],
- "source": [
- "# Get files by UF from group\n",
- "print(\"SR - São Paulo files: \" + str(len( cnes.get_files(\"SR\", uf=\"SP\" ))))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "440"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Multiple UFs\n",
- "len(cnes.get_files(\"SR\", uf=[\"SP\", \"RJ\"]))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### by year"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "216"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(cnes.get_files(\"SR\", year=2023)) # or 23"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1243"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Multiple Years\n",
- "len(cnes.get_files(\"SR\", year=[20, 21, 22, 23]))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### by month"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1945"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(cnes.get_files(\"SR\", month=[1, 2, 3, 4])) # or single month"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Combining filters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[SRSP2301.dbc, SRSP2302.dbc, SRSP2303.dbc]"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "files = cnes.get_files(\"SR\", uf=\"SP\", year=2023, month=[1,2,3])\n",
- "files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Describing Files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'group': 'Serviço Especializado',\n",
- " 'last_update': '2023-02-17 07:31AM',\n",
- " 'month': 'Janeiro',\n",
- " 'name': 'SRSP2301.dbc',\n",
- " 'size': '1.6 MB',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 2023}\n",
- "{'group': 'Serviço Especializado',\n",
- " 'last_update': '2023-03-14 02:34PM',\n",
- " 'month': 'Fevereiro',\n",
- " 'name': 'SRSP2302.dbc',\n",
- " 'size': '1.6 MB',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 2023}\n",
- "{'group': 'Serviço Especializado',\n",
- " 'last_update': '2023-04-17 07:34AM',\n",
- " 'month': 'Março',\n",
- " 'name': 'SRSP2303.dbc',\n",
- " 'size': '1.6 MB',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 2023}\n"
- ]
- }
- ],
- "source": [
- "from pprint import pprint\n",
- "for file in files:\n",
- " pprint(cnes.describe(file))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Downloading Files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "SRSP2303.parquet: 100%|██████████| 81.4k/81.4k [00:05<00:00, 14.8kB/s]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[/home/bida/pysus/SRSP2301.parquet,\n",
- " /home/bida/pysus/SRSP2302.parquet,\n",
- " /home/bida/pysus/SRSP2303.parquet]"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquets = cnes.download(files)\n",
- "parquets"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reading files\n",
- "\n",
- "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " CNES | \n",
- " CODUFMUN | \n",
- " SERV_ESP | \n",
- " CLASS_SR | \n",
- " SRVUNICO | \n",
- " REGSAUDE | \n",
- " MICR_REG | \n",
- " DISTRSAN | \n",
- " DISTRADM | \n",
- " TPGESTAO | \n",
- " ... | \n",
- " CNPJ_MAN | \n",
- " CARACTER | \n",
- " AMB_NSUS | \n",
- " AMB_SUS | \n",
- " HOSP_NSUS | \n",
- " HOSP_SUS | \n",
- " COMPETEN | \n",
- " CONTSRVU | \n",
- " CNESTERC | \n",
- " NAT_JUR | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 2790920 | \n",
- " 351450 | \n",
- " 145 | \n",
- " 008 | \n",
- " | \n",
- " 0206 | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 00000000000000 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 2062 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2790920 | \n",
- " 351450 | \n",
- " 145 | \n",
- " 009 | \n",
- " | \n",
- " 0206 | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 00000000000000 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 2062 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2790920 | \n",
- " 351450 | \n",
- " 145 | \n",
- " 010 | \n",
- " | \n",
- " 0206 | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 00000000000000 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 2062 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 2790920 | \n",
- " 351450 | \n",
- " 145 | \n",
- " 013 | \n",
- " | \n",
- " 0206 | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 00000000000000 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 2062 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 3025489 | \n",
- " 351450 | \n",
- " 111 | \n",
- " 001 | \n",
- " 111 | \n",
- " 0206 | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 46137485000160 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " 1 | \n",
- " | \n",
- " 1244 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 144830 | \n",
- " 0008885 | \n",
- " 354780 | \n",
- " 141 | \n",
- " 002 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 46522942000130 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 1244 | \n",
- "
\n",
- " \n",
- " | 144831 | \n",
- " 0008885 | \n",
- " 354780 | \n",
- " 144 | \n",
- " 001 | \n",
- " 144 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 46522942000130 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " 1 | \n",
- " | \n",
- " 1244 | \n",
- "
\n",
- " \n",
- " | 144832 | \n",
- " 0008885 | \n",
- " 354780 | \n",
- " 159 | \n",
- " 001 | \n",
- " 159 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 46522942000130 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " 1 | \n",
- " | \n",
- " 1244 | \n",
- "
\n",
- " \n",
- " | 144833 | \n",
- " 0008885 | \n",
- " 354780 | \n",
- " 159 | \n",
- " 004 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 46522942000130 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 1244 | \n",
- "
\n",
- " \n",
- " | 144834 | \n",
- " 0008885 | \n",
- " 354780 | \n",
- " 159 | \n",
- " 005 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " M | \n",
- " ... | \n",
- " 46522942000130 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 202301 | \n",
- " | \n",
- " | \n",
- " 1244 | \n",
- "
\n",
- " \n",
- "
\n",
- "
144835 rows × 32 columns
\n",
- "
"
- ],
- "text/plain": [
- " CNES CODUFMUN SERV_ESP CLASS_SR SRVUNICO REGSAUDE MICR_REG \\\n",
- "0 2790920 351450 145 008 0206 \n",
- "1 2790920 351450 145 009 0206 \n",
- "2 2790920 351450 145 010 0206 \n",
- "3 2790920 351450 145 013 0206 \n",
- "4 3025489 351450 111 001 111 0206 \n",
- "... ... ... ... ... ... ... ... \n",
- "144830 0008885 354780 141 002 \n",
- "144831 0008885 354780 144 001 144 \n",
- "144832 0008885 354780 159 001 159 \n",
- "144833 0008885 354780 159 004 \n",
- "144834 0008885 354780 159 005 \n",
- "\n",
- " DISTRSAN DISTRADM TPGESTAO ... CNPJ_MAN CARACTER AMB_NSUS \\\n",
- "0 M ... 00000000000000 1 1 \n",
- "1 M ... 00000000000000 1 1 \n",
- "2 M ... 00000000000000 1 1 \n",
- "3 M ... 00000000000000 1 1 \n",
- "4 M ... 46137485000160 1 0 \n",
- "... ... ... ... ... ... ... ... \n",
- "144830 M ... 46522942000130 1 0 \n",
- "144831 M ... 46522942000130 1 0 \n",
- "144832 M ... 46522942000130 1 0 \n",
- "144833 M ... 46522942000130 1 0 \n",
- "144834 M ... 46522942000130 1 0 \n",
- "\n",
- " AMB_SUS HOSP_NSUS HOSP_SUS COMPETEN CONTSRVU CNESTERC NAT_JUR \n",
- "0 0 0 0 202301 2062 \n",
- "1 0 0 0 202301 2062 \n",
- "2 0 0 0 202301 2062 \n",
- "3 0 0 0 202301 2062 \n",
- "4 1 0 0 202301 1 1244 \n",
- "... ... ... ... ... ... ... ... \n",
- "144830 1 0 0 202301 1244 \n",
- "144831 1 0 0 202301 1 1244 \n",
- "144832 1 0 0 202301 1 1244 \n",
- "144833 1 0 0 202301 1244 \n",
- "144834 1 0 0 202301 1244 \n",
- "\n",
- "[144835 rows x 32 columns]"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquets[0].to_dataframe()"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "name": "Getting CNES Data.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/docs/source/databases/SIA.ipynb b/docs/source/databases/SIA.ipynb
deleted file mode 100644
index d201580a..00000000
--- a/docs/source/databases/SIA.ipynb
+++ /dev/null
@@ -1,694 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a73920ee-3902-4270-a17d-b7907d8561d7",
- "metadata": {},
- "source": [
- "# SIA FTP Database\n",
- "##### Sistema de Informações Ambulatoriais"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "8d619b0d-300c-4bc2-a738-bf96d650311d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from pysus import SIA\n",
- "sia = SIA().load() # Loads the files from DATASUS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "4d50b674-b5bd-4ec5-a812-15c680841879",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SIA - Sistema de Informações Ambulatoriais"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sia"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "5a74172b-841e-4ba7-bcc6-41ec9a216423",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'long_name': 'Sistema de Informações Ambulatoriais',\n",
- " 'source': 'http://sia.datasus.gov.br/principal/index.php',\n",
- " 'description': 'O Sistema de Informação Ambulatorial (SIA) foi instituído pela Portaria GM/MS n.º 896 de 29 de junho de 1990. Originalmente, o SIA foi concebido a partir do projeto SICAPS (Sistema de Informação e Controle Ambulatorial da Previdência Social), em que os conceitos, os objetivos e as diretrizes criados para o desenvolvimento do SICAPS foram extremamente importantes e amplamente utilizados para o desenvolvimento do SIA, tais como: (i) o acompanhamento das programações físicas e orçamentárias; (ii) o acompanhamento das ações de saúde produzidas; (iii) a agilização do pagamento e controle orçamentário e financeiro; e (iv) a formação de banco de dados para contribuir com a construção do SUS.'}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sia.metadata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "10e69ba8-baa1-4718-b53e-40af23084324",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'AB': 'APAC de Cirurgia Bariátrica',\n",
- " 'ABO': 'APAC de Acompanhamento Pós Cirurgia Bariátrica',\n",
- " 'ACF': 'APAC de Confecção de Fístula',\n",
- " 'AD': 'APAC de Laudos Diversos',\n",
- " 'AM': 'APAC de Medicamentos',\n",
- " 'AMP': 'APAC de Acompanhamento Multiprofissional',\n",
- " 'AN': 'APAC de Nefrologia',\n",
- " 'AQ': 'APAC de Quimioterapia',\n",
- " 'AR': 'APAC de Radioterapia',\n",
- " 'ATD': 'APAC de Tratamento Dialítico',\n",
- " 'BI': 'Boletim de Produção Ambulatorial individualizado',\n",
- " 'IMPBO': '',\n",
- " 'PA': 'Produção Ambulatorial',\n",
- " 'PAM': '',\n",
- " 'PAR': '',\n",
- " 'PAS': '',\n",
- " 'PS': 'RAAS Psicossocial',\n",
- " 'SAD': 'RAAS de Atenção Domiciliar'}"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sia.groups"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ff0b298c-69cf-4884-b7c8-2936de2c3508",
- "metadata": {},
- "source": [
- "### Getting specific files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "3910f87e-965b-4a5e-8fb2-9c9ad257d0f7",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "14300 files\n"
- ]
- }
- ],
- "source": [
- "print(str(len(sia.get_files([\"PA\", \"BI\"]))) + \" files\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "c3cd4449-55fd-418d-a6e0-53ff38cd9258",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[PASP0001.dbc,\n",
- " PASP0002.dbc,\n",
- " PASP0004.dbc,\n",
- " PASP0005.dbc,\n",
- " Pasp0006.dbc,\n",
- " pasp0003.dbc,\n",
- " pasp0007.dbc,\n",
- " pasp0008.dbc,\n",
- " pasp0009.dbc,\n",
- " pasp0010.dbc,\n",
- " pasp0011.dbc,\n",
- " pasp0012.dbc]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sia.get_files(\"PA\", uf=\"SP\", year=2000)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "3883f518-c21e-4334-b18e-7fd9127aa83f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[PASP0001.dbc, PASP0002.dbc]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "files = sia.get_files(\"PA\", uf=\"SP\", year=2000, month=[1,2])\n",
- "files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c065668a-0157-40cb-8662-0daf9db2937b",
- "metadata": {},
- "source": [
- "### Describing files inside DATASUS server"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "017d4445-f172-4376-ba85-19c06f9d1de4",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'name': 'PASP0001.dbc',\n",
- " 'group': 'Produção Ambulatorial',\n",
- " 'uf': 'São Paulo',\n",
- " 'month': 'Janeiro',\n",
- " 'year': 2000,\n",
- " 'size': '7.2 MB',\n",
- " 'last_update': '2013-10-24 04:18PM'}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sia.describe(files[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "59039f48-9c9b-4807-81f6-24d2139a70b8",
- "metadata": {},
- "source": [
- "### Downloading files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dc671449-d4d3-4e32-9afd-68f0d844a104",
- "metadata": {},
- "source": [
- "You can rather download multiple files or download them individually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "8f53a375-266b-4290-8705-408d236fd6d1",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "PASP0002.parquet: 100%|████████████| 447k/447k [00:20<00:00, 21.6kB/s]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[/home/bida/pysus/PASP0001.parquet, /home/bida/pysus/PASP0002.parquet]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sia.download(files) # or specify a directory with `local_dir=` "
- ]
- },
- {
- "cell_type": "markdown",
- "id": "70e10d61-d2a7-49ed-a409-38175321df04",
- "metadata": {},
- "source": [
- "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. "
- ]
- },
- {
- "cell_type": "markdown",
- "id": "886389c2-5c26-43c2-9820-0c3fa9d85021",
- "metadata": {},
- "source": [
- "### Reading files\n",
- "\n",
- "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "4d2ddb3e-0d04-4b3c-952f-12ddd658751b",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████| 7.73M/7.73M [00:00<00:00, 4.84GB/s]\n"
- ]
- }
- ],
- "source": [
- "parquet = sia.download(files)[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "0ef3fb2b-af49-4744-9692-410c4c4820b1",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " PA_CONDIC | \n",
- " PA_GESTAO | \n",
- " PA_CODUNI | \n",
- " PA_DATREF | \n",
- " PA_CODPRO | \n",
- " PA_DOCORIG | \n",
- " PA_CODESP | \n",
- " PA_TIPPRO | \n",
- " PA_TIPATE | \n",
- " PA_FXETAR | \n",
- " ... | \n",
- " PA_DATPR | \n",
- " PA_VALPRO | \n",
- " PA_VALAPR | \n",
- " PA_UFMUN | \n",
- " PA_MUNAT | \n",
- " PA_NUMAPA | \n",
- " PA_CODOCO | \n",
- " PA_CIDPRI | \n",
- " PA_CIDSEC | \n",
- " PA_MORFOL | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " EC | \n",
- " 359999 | \n",
- " 000867 | \n",
- " 200001 | \n",
- " 0701223 | \n",
- " B | \n",
- " 27 | \n",
- " 02 | \n",
- " 99 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 58.65 | \n",
- " 58.65 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " EC | \n",
- " 359999 | \n",
- " 000867 | \n",
- " 200001 | \n",
- " 0701223 | \n",
- " B | \n",
- " 27 | \n",
- " 02 | \n",
- " 99 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 15.30 | \n",
- " 15.30 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " EC | \n",
- " 359999 | \n",
- " 000867 | \n",
- " 200001 | \n",
- " 0701223 | \n",
- " B | \n",
- " 27 | \n",
- " 02 | \n",
- " 99 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 12.75 | \n",
- " 12.75 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " EC | \n",
- " 359999 | \n",
- " 000867 | \n",
- " 200001 | \n",
- " 0701223 | \n",
- " B | \n",
- " 27 | \n",
- " 02 | \n",
- " 99 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 12.75 | \n",
- " 12.75 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " EC | \n",
- " 359999 | \n",
- " 000867 | \n",
- " 200001 | \n",
- " 0701223 | \n",
- " B | \n",
- " 27 | \n",
- " 02 | \n",
- " 99 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 5.10 | \n",
- " 5.10 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 725999 | \n",
- " EA | \n",
- " 359999 | \n",
- " 016127 | \n",
- " 200001 | \n",
- " 0102302 | \n",
- " B | \n",
- " 77 | \n",
- " 00 | \n",
- " 00 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 3972.00 | \n",
- " 3972.00 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 726000 | \n",
- " EA | \n",
- " 359999 | \n",
- " 016127 | \n",
- " 200001 | \n",
- " 0302206 | \n",
- " B | \n",
- " 30 | \n",
- " 00 | \n",
- " 00 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 2.48 | \n",
- " 2.48 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 726001 | \n",
- " EA | \n",
- " 359999 | \n",
- " 016127 | \n",
- " 200001 | \n",
- " 0302207 | \n",
- " B | \n",
- " 30 | \n",
- " 00 | \n",
- " 00 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 6.39 | \n",
- " 6.39 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 726002 | \n",
- " EA | \n",
- " 359999 | \n",
- " 016127 | \n",
- " 200001 | \n",
- " 0304103 | \n",
- " B | \n",
- " 30 | \n",
- " 00 | \n",
- " 00 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 7.92 | \n",
- " 7.92 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 726003 | \n",
- " EA | \n",
- " 359999 | \n",
- " 016127 | \n",
- " 200001 | \n",
- " 0401103 | \n",
- " B | \n",
- " 01 | \n",
- " 00 | \n",
- " 27 | \n",
- " | \n",
- " ... | \n",
- " 200001 | \n",
- " 2.00 | \n",
- " 2.00 | \n",
- " 355030 | \n",
- " 355030 | \n",
- " 00000000000 | \n",
- " S01 | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- "
\n",
- "
726004 rows × 24 columns
\n",
- "
"
- ],
- "text/plain": [
- " PA_CONDIC PA_GESTAO PA_CODUNI PA_DATREF PA_CODPRO PA_DOCORIG PA_CODESP \\\n",
- "0 EC 359999 000867 200001 0701223 B 27 \n",
- "1 EC 359999 000867 200001 0701223 B 27 \n",
- "2 EC 359999 000867 200001 0701223 B 27 \n",
- "3 EC 359999 000867 200001 0701223 B 27 \n",
- "4 EC 359999 000867 200001 0701223 B 27 \n",
- "... ... ... ... ... ... ... ... \n",
- "725999 EA 359999 016127 200001 0102302 B 77 \n",
- "726000 EA 359999 016127 200001 0302206 B 30 \n",
- "726001 EA 359999 016127 200001 0302207 B 30 \n",
- "726002 EA 359999 016127 200001 0304103 B 30 \n",
- "726003 EA 359999 016127 200001 0401103 B 01 \n",
- "\n",
- " PA_TIPPRO PA_TIPATE PA_FXETAR ... PA_DATPR PA_VALPRO PA_VALAPR \\\n",
- "0 02 99 ... 200001 58.65 58.65 \n",
- "1 02 99 ... 200001 15.30 15.30 \n",
- "2 02 99 ... 200001 12.75 12.75 \n",
- "3 02 99 ... 200001 12.75 12.75 \n",
- "4 02 99 ... 200001 5.10 5.10 \n",
- "... ... ... ... ... ... ... ... \n",
- "725999 00 00 ... 200001 3972.00 3972.00 \n",
- "726000 00 00 ... 200001 2.48 2.48 \n",
- "726001 00 00 ... 200001 6.39 6.39 \n",
- "726002 00 00 ... 200001 7.92 7.92 \n",
- "726003 00 27 ... 200001 2.00 2.00 \n",
- "\n",
- " PA_UFMUN PA_MUNAT PA_NUMAPA PA_CODOCO PA_CIDPRI PA_CIDSEC PA_MORFOL \n",
- "0 355030 355030 00000000000 S01 \n",
- "1 355030 355030 00000000000 S01 \n",
- "2 355030 355030 00000000000 S01 \n",
- "3 355030 355030 00000000000 S01 \n",
- "4 355030 355030 00000000000 S01 \n",
- "... ... ... ... ... ... ... ... \n",
- "725999 355030 355030 00000000000 S01 \n",
- "726000 355030 355030 00000000000 S01 \n",
- "726001 355030 355030 00000000000 S01 \n",
- "726002 355030 355030 00000000000 S01 \n",
- "726003 355030 355030 00000000000 S01 \n",
- "\n",
- "[726004 rows x 24 columns]"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquet.to_dataframe()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/databases/SIH.ipynb b/docs/source/databases/SIH.ipynb
deleted file mode 100644
index c86615cb..00000000
--- a/docs/source/databases/SIH.ipynb
+++ /dev/null
@@ -1,685 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "a73920ee-3902-4270-a17d-b7907d8561d7",
- "metadata": {},
- "source": [
- "# SIH FTP Database\n",
- "##### Sistema de Informações Hospitalares"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "8d619b0d-300c-4bc2-a738-bf96d650311d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from pysus import SIH\n",
- "sih = SIH().load() # Loads the files from DATASUS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "4d50b674-b5bd-4ec5-a812-15c680841879",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SIH - Sistema de Informações Hospitalares"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sih"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "5a74172b-841e-4ba7-bcc6-41ec9a216423",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'long_name': 'Sistema de Informações Hospitalares',\n",
- " 'source': ('https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/',\n",
- " 'https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/'),\n",
- " 'description': 'A finalidade do AIH (Sistema SIHSUS) é a de transcrever todos os atendimentos que provenientes de internações hospitalares que foram financiadas pelo SUS, e após o processamento, gerarem relatórios para os gestores que lhes possibilitem fazer os pagamentos dos estabelecimentos de saúde. Além disso, o nível Federal recebe mensalmente uma base de dados de todas as internações autorizadas (aprovadas ou não para pagamento) para que possam ser repassados às Secretarias de Saúde os valores de Produção de Média e Alta complexidade além dos valores de CNRAC, FAEC e de Hospitais Universitários – em suas variadas formas de contrato de gestão.'}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sih.metadata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "10e69ba8-baa1-4718-b53e-40af23084324",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'RD': 'AIH Reduzida',\n",
- " 'RJ': 'AIH Rejeitada',\n",
- " 'ER': 'AIH Rejeitada com erro',\n",
- " 'SP': 'Serviços Profissionais',\n",
- " 'CH': 'Cadastro Hospitalar',\n",
- " 'CM': ''}"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sih.groups"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ff0b298c-69cf-4884-b7c8-2936de2c3508",
- "metadata": {},
- "source": [
- "### Getting specific files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "3910f87e-965b-4a5e-8fb2-9c9ad257d0f7",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "15649 files\n"
- ]
- }
- ],
- "source": [
- "print(str(len(sih.get_files([\"RD\", \"RJ\"]))) + \" files\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "c3cd4449-55fd-418d-a6e0-53ff38cd9258",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[RDSP0001.dbc,\n",
- " RDSP0002.dbc,\n",
- " RDSP0003.dbc,\n",
- " RDSP0004.dbc,\n",
- " RDSP0005.dbc,\n",
- " RDSP0006.dbc,\n",
- " RDSP0007.dbc,\n",
- " RDSP0008.dbc,\n",
- " RDSP0009.dbc,\n",
- " RDSP0010.dbc,\n",
- " RDSP0011.dbc,\n",
- " RDSP0012.dbc]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sih.get_files(\"RD\", uf=\"SP\", year=2000)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "3883f518-c21e-4334-b18e-7fd9127aa83f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[RDSP0001.dbc, RDSP0002.dbc, RDSP0003.dbc]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "files = sih.get_files(\"RD\", uf=\"SP\", year=2000, month=[1,2,3])\n",
- "files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c065668a-0157-40cb-8662-0daf9db2937b",
- "metadata": {},
- "source": [
- "### Describing files inside DATASUS server"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "017d4445-f172-4376-ba85-19c06f9d1de4",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'name': 'RDSP0001.dbc',\n",
- " 'group': 'AIH Reduzida',\n",
- " 'uf': 'São Paulo',\n",
- " 'month': 'Janeiro',\n",
- " 'year': 2000,\n",
- " 'size': '10.1 MB',\n",
- " 'last_update': '2013-10-31 01:14PM'}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sih.describe(files[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "59039f48-9c9b-4807-81f6-24d2139a70b8",
- "metadata": {},
- "source": [
- "### Downloading files"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "dc671449-d4d3-4e32-9afd-68f0d844a104",
- "metadata": {},
- "source": [
- "You can rather download multiple files or download them individually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "8f53a375-266b-4290-8705-408d236fd6d1",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "RDSP0003.parquet: 100%|████████████| 340k/340k [00:12<00:00, 28.0kB/s]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[/home/bida/pysus/RDSP0001.parquet,\n",
- " /home/bida/pysus/RDSP0002.parquet,\n",
- " /home/bida/pysus/RDSP0003.parquet]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sih.download(files) # Specify a directory with `local_dir=`"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "70e10d61-d2a7-49ed-a409-38175321df04",
- "metadata": {},
- "source": [
- "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. "
- ]
- },
- {
- "cell_type": "markdown",
- "id": "886389c2-5c26-43c2-9820-0c3fa9d85021",
- "metadata": {},
- "source": [
- "### Reading files\n",
- "\n",
- "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "5882ddc1-9dfb-4181-baed-2adb09bf66e8",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████| 8.46M/8.46M [00:00<00:00, 4.04GB/s]\n"
- ]
- }
- ],
- "source": [
- "parquet = sih.download(files)[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "70db53ae-7105-48d8-9a42-868385dc3982",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " UF_ZI | \n",
- " ANO_CMPT | \n",
- " MES_CMPT | \n",
- " ESPEC | \n",
- " CGC_HOSP | \n",
- " N_AIH | \n",
- " IDENT | \n",
- " CEP | \n",
- " MUNIC_RES | \n",
- " NASC | \n",
- " ... | \n",
- " CAR_INT | \n",
- " TOT_PT_SP | \n",
- " CPF_AUT | \n",
- " HOMONIMO | \n",
- " NUM_FILHOS | \n",
- " INSTRU | \n",
- " CID_NOTIF | \n",
- " CONTRACEP1 | \n",
- " CONTRACEP2 | \n",
- " GESTRISCO | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380830 | \n",
- " 1 | \n",
- " 06040090 | \n",
- " 353440 | \n",
- " 19631229 | \n",
- " ... | \n",
- " 05 | \n",
- " 719 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380841 | \n",
- " 1 | \n",
- " 06110000 | \n",
- " 353440 | \n",
- " 19620609 | \n",
- " ... | \n",
- " 05 | \n",
- " 176 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380852 | \n",
- " 1 | \n",
- " 06184080 | \n",
- " 353440 | \n",
- " 19781207 | \n",
- " ... | \n",
- " 05 | \n",
- " 36 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380863 | \n",
- " 1 | \n",
- " 06010020 | \n",
- " 353440 | \n",
- " 19710106 | \n",
- " ... | \n",
- " 05 | \n",
- " 176 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380874 | \n",
- " 1 | \n",
- " 06112010 | \n",
- " 353440 | \n",
- " 19710717 | \n",
- " ... | \n",
- " 05 | \n",
- " 158 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 193819 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380786 | \n",
- " 1 | \n",
- " 06036090 | \n",
- " 353440 | \n",
- " 19740804 | \n",
- " ... | \n",
- " 05 | \n",
- " 737 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 193820 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380797 | \n",
- " 1 | \n",
- " 06240120 | \n",
- " 353440 | \n",
- " 19670403 | \n",
- " ... | \n",
- " 05 | \n",
- " 737 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 193821 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380808 | \n",
- " 1 | \n",
- " 06140080 | \n",
- " 353440 | \n",
- " 19741001 | \n",
- " ... | \n",
- " 05 | \n",
- " 737 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 193822 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380819 | \n",
- " 1 | \n",
- " 06240090 | \n",
- " 353440 | \n",
- " 19721028 | \n",
- " ... | \n",
- " 05 | \n",
- " 719 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 193823 | \n",
- " 35 | \n",
- " 2000 | \n",
- " 01 | \n",
- " 02 | \n",
- " 46523171000287 | \n",
- " 2179380820 | \n",
- " 1 | \n",
- " 06260030 | \n",
- " 353440 | \n",
- " 19721122 | \n",
- " ... | \n",
- " 05 | \n",
- " 737 | \n",
- " | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
193824 rows × 60 columns
\n",
- "
"
- ],
- "text/plain": [
- " UF_ZI ANO_CMPT MES_CMPT ESPEC CGC_HOSP N_AIH IDENT \\\n",
- "0 35 2000 01 02 46523171000287 2179380830 1 \n",
- "1 35 2000 01 02 46523171000287 2179380841 1 \n",
- "2 35 2000 01 02 46523171000287 2179380852 1 \n",
- "3 35 2000 01 02 46523171000287 2179380863 1 \n",
- "4 35 2000 01 02 46523171000287 2179380874 1 \n",
- "... ... ... ... ... ... ... ... \n",
- "193819 35 2000 01 02 46523171000287 2179380786 1 \n",
- "193820 35 2000 01 02 46523171000287 2179380797 1 \n",
- "193821 35 2000 01 02 46523171000287 2179380808 1 \n",
- "193822 35 2000 01 02 46523171000287 2179380819 1 \n",
- "193823 35 2000 01 02 46523171000287 2179380820 1 \n",
- "\n",
- " CEP MUNIC_RES NASC ... CAR_INT TOT_PT_SP CPF_AUT HOMONIMO \\\n",
- "0 06040090 353440 19631229 ... 05 719 \n",
- "1 06110000 353440 19620609 ... 05 176 \n",
- "2 06184080 353440 19781207 ... 05 36 \n",
- "3 06010020 353440 19710106 ... 05 176 \n",
- "4 06112010 353440 19710717 ... 05 158 \n",
- "... ... ... ... ... ... ... ... ... \n",
- "193819 06036090 353440 19740804 ... 05 737 \n",
- "193820 06240120 353440 19670403 ... 05 737 \n",
- "193821 06140080 353440 19741001 ... 05 737 \n",
- "193822 06240090 353440 19721028 ... 05 719 \n",
- "193823 06260030 353440 19721122 ... 05 737 \n",
- "\n",
- " NUM_FILHOS INSTRU CID_NOTIF CONTRACEP1 CONTRACEP2 GESTRISCO \n",
- "0 0 0 00 00 0 \n",
- "1 0 0 00 00 0 \n",
- "2 0 0 00 00 0 \n",
- "3 0 0 00 00 0 \n",
- "4 0 0 00 00 0 \n",
- "... ... ... ... ... ... ... \n",
- "193819 0 0 00 00 0 \n",
- "193820 0 0 00 00 0 \n",
- "193821 0 0 00 00 0 \n",
- "193822 0 0 00 00 0 \n",
- "193823 0 0 00 00 0 \n",
- "\n",
- "[193824 rows x 60 columns]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquet.to_dataframe()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/docs/source/databases/SIM.ipynb b/docs/source/databases/SIM.ipynb
deleted file mode 100644
index 84c00eda..00000000
--- a/docs/source/databases/SIM.ipynb
+++ /dev/null
@@ -1,705 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "# SIM FTP Database\n",
- "##### Sistema de Informação sobre Mortalidade\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "from pysus import SIM\n",
- "sim = SIM().load() # Loads the files from DATASUS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'long_name': 'Sistema de Informação sobre Mortalidade',\n",
- " 'source': 'http://sim.saude.gov.br',\n",
- " 'description': ''}"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.metadata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'CID10': 'DO', 'CID9': 'DOR'}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[/dissemin/publicos/SIM/CID10/DORES, /dissemin/publicos/SIM/CID9/DORES]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.paths"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "For more information about CID9 and CID10, visit http://tabnet.saude.es.gov.br/cgi/tabnet/sim/sim96/obtdescr.htm"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Getting specific files "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[DORSP95.DBC]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.get_files(\"CID9\", uf=\"SP\", year=1995)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[DORJ2019.dbc,\n",
- " DORJ2020.dbc,\n",
- " DORJ2021.dbc,\n",
- " DOSP2019.dbc,\n",
- " DOSP2020.dbc,\n",
- " DOSP2021.dbc]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.get_files(\"CID10\", uf=[\"SP\", \"RJ\"], year=[2019, 2020, 2021])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "files = sim.get_files([\"CID9\", \"CID10\"], uf=[\"SP\"], year=[1995, 2020])\n",
- "sp_cid9, sp_cid10 = files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Describing a file inside DATASUS server"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'name': 'DORSP95.DBC',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 1995,\n",
- " 'group': 'CID9',\n",
- " 'size': '8.2 MB',\n",
- " 'last_update': '2020-01-31 02:48PM'}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.describe(sp_cid9)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'name': 'DOSP2020.dbc',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 2020,\n",
- " 'group': 'CID10',\n",
- " 'size': '28.7 MB',\n",
- " 'last_update': '2022-03-31 04:19PM'}"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.describe(sp_cid10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Downloading files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can rather download multiple files or download them individually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "DORSP95.parquet: 100%|█████████████| 434k/434k [00:12<00:00, 36.0kB/s]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[/home/bida/pysus/DORSP95.parquet]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sim.download(sp_cid9) # Downloads to default directory"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "/home/bida/pysus/DORSP95.parquet"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquet = sp_cid9.download() # Or in a custom directory with `local_dir=`\n",
- "parquet"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reading files\n",
- "\n",
- "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " contador | \n",
- " CARTORIO | \n",
- " REGISTRO | \n",
- " DATAREG | \n",
- " TIPOBITO | \n",
- " DATAOBITO | \n",
- " ESTCIVIL | \n",
- " SEXO | \n",
- " DATANASC | \n",
- " IDADE | \n",
- " ... | \n",
- " FONTINFO | \n",
- " ACIDTRAB | \n",
- " LOCACID | \n",
- " CRITICA | \n",
- " NUMEXPORT | \n",
- " CRSOCOR | \n",
- " CRSRES | \n",
- " RACACOR | \n",
- " ETNIA | \n",
- " UFINFORM | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 180001 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951002 | \n",
- " 2 | \n",
- " 1 | \n",
- " 19291003 | \n",
- " 465 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 180002 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951002 | \n",
- " 3 | \n",
- " 2 | \n",
- " 18980317 | \n",
- " 497 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 180003 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951003 | \n",
- " 2 | \n",
- " 2 | \n",
- " 19281002 | \n",
- " 467 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 180004 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951003 | \n",
- " 3 | \n",
- " 1 | \n",
- " 19110613 | \n",
- " 484 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 180005 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951004 | \n",
- " 1 | \n",
- " 1 | \n",
- " 19610914 | \n",
- " 434 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 227832 | \n",
- " 179996 | \n",
- " | \n",
- " | \n",
- " 951004 | \n",
- " 2 | \n",
- " 951001 | \n",
- " 4 | \n",
- " 1 | \n",
- " 19380423 | \n",
- " 457 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 227833 | \n",
- " 179997 | \n",
- " | \n",
- " | \n",
- " 951004 | \n",
- " 2 | \n",
- " 951001 | \n",
- " 2 | \n",
- " 1 | \n",
- " 19470130 | \n",
- " 448 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 227834 | \n",
- " 179998 | \n",
- " | \n",
- " | \n",
- " 951004 | \n",
- " 2 | \n",
- " 951001 | \n",
- " 3 | \n",
- " 2 | \n",
- " 19160113 | \n",
- " 479 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 227835 | \n",
- " 179999 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951001 | \n",
- " 1 | \n",
- " 1 | \n",
- " 19550901 | \n",
- " 440 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " | 227836 | \n",
- " 180000 | \n",
- " | \n",
- " | \n",
- " 951006 | \n",
- " 2 | \n",
- " 951001 | \n",
- " 1 | \n",
- " 1 | \n",
- " 19700510 | \n",
- " 425 | \n",
- " ... | \n",
- " | \n",
- " 0 | \n",
- " 0 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 35 | \n",
- "
\n",
- " \n",
- "
\n",
- "
227837 rows × 50 columns
\n",
- "
"
- ],
- "text/plain": [
- " contador CARTORIO REGISTRO DATAREG TIPOBITO DATAOBITO ESTCIVIL \\\n",
- "0 180001 951006 2 951002 2 \n",
- "1 180002 951006 2 951002 3 \n",
- "2 180003 951006 2 951003 2 \n",
- "3 180004 951006 2 951003 3 \n",
- "4 180005 951006 2 951004 1 \n",
- "... ... ... ... ... ... ... ... \n",
- "227832 179996 951004 2 951001 4 \n",
- "227833 179997 951004 2 951001 2 \n",
- "227834 179998 951004 2 951001 3 \n",
- "227835 179999 951006 2 951001 1 \n",
- "227836 180000 951006 2 951001 1 \n",
- "\n",
- " SEXO DATANASC IDADE ... FONTINFO ACIDTRAB LOCACID CRITICA \\\n",
- "0 1 19291003 465 ... 0 0 \n",
- "1 2 18980317 497 ... 0 0 \n",
- "2 2 19281002 467 ... 0 0 \n",
- "3 1 19110613 484 ... 0 0 \n",
- "4 1 19610914 434 ... 0 0 \n",
- "... ... ... ... ... ... ... ... ... \n",
- "227832 1 19380423 457 ... 0 0 \n",
- "227833 1 19470130 448 ... 0 0 \n",
- "227834 2 19160113 479 ... 0 0 \n",
- "227835 1 19550901 440 ... 0 0 \n",
- "227836 1 19700510 425 ... 0 0 \n",
- "\n",
- " NUMEXPORT CRSOCOR CRSRES RACACOR ETNIA UFINFORM \n",
- "0 35 \n",
- "1 35 \n",
- "2 35 \n",
- "3 35 \n",
- "4 35 \n",
- "... ... ... ... ... ... ... \n",
- "227832 35 \n",
- "227833 35 \n",
- "227834 35 \n",
- "227835 35 \n",
- "227836 35 \n",
- "\n",
- "[227837 rows x 50 columns]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquet.to_dataframe()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
- },
- "vscode": {
- "interpreter": {
- "hash": "2a96a5ccec8dfcba7d06b2e71f6eef3b5dac5716461bf5d73ea1bb7ee462cdaa"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/docs/source/databases/SINAN.ipynb b/docs/source/databases/SINAN.ipynb
deleted file mode 100644
index 48318721..00000000
--- a/docs/source/databases/SINAN.ipynb
+++ /dev/null
@@ -1,1222 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "# SINAN FTP Database \n",
- "##### Available diseases and years to download\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "from pysus import SINAN\n",
- "sinan = SINAN().load() # Loads the files from DATASUS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'long_name': 'Doenças e Agravos de Notificação',\n",
- " 'source': 'https://portalsinan.saude.gov.br/',\n",
- " 'description': 'The Notifiable Diseases Information System - Sinan is primarilyfed by the notification and investigation of cases of diseases and conditions listed in the national list of compulsorily notifiable diseases (Consolidation Ordinance No. 4, September 28, 2017, Annex).However, states and municipalities are allowed to include other important health problems in their region, such as difilobotriasis in the municipality of São Paulo. Its effective use enables the dynamic diagnosis of the occurrence of an event in the population, providing evidence for causal explanations of compulsorily notifiable diseases and indicating risks to which people are exposed. This contributes to identifying the epidemiological reality of a specific geographical area. Its systematic, decentralized use contributes to the democratization of information, allowing all healthcare professionals to access and make it available to the community. Therefore, it is a relevant tool to assist in health planning, define intervention priorities, and evaluate the impact of interventions.'}"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.metadata"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Listing SINAN Codes & Diseases"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'ACBI': 'Acidente de trabalho com material biológico',\n",
- " 'ACGR': 'Acidente de trabalho',\n",
- " 'ANIM': 'Acidente por Animais Peçonhentos',\n",
- " 'ANTR': 'Atendimento Antirrabico',\n",
- " 'BOTU': 'Botulismo',\n",
- " 'CANC': 'Cancêr relacionado ao trabalho',\n",
- " 'CHAG': 'Doença de Chagas Aguda',\n",
- " 'CHIK': 'Febre de Chikungunya',\n",
- " 'COLE': 'Cólera',\n",
- " 'COQU': 'Coqueluche',\n",
- " 'DENG': 'Dengue',\n",
- " 'DERM': 'Dermatoses ocupacionais',\n",
- " 'DIFT': 'Difteria',\n",
- " 'ESQU': 'Esquistossomose',\n",
- " 'EXAN': 'Doença exantemáticas',\n",
- " 'FMAC': 'Febre Maculosa',\n",
- " 'FTIF': 'Febre Tifóide',\n",
- " 'HANS': 'Hanseníase',\n",
- " 'HANT': 'Hantavirose',\n",
- " 'HEPA': 'Hepatites Virais',\n",
- " 'IEXO': 'Intoxicação Exógena',\n",
- " 'INFL': 'Influenza Pandêmica',\n",
- " 'LEIV': 'Leishmaniose Visceral',\n",
- " 'LEPT': 'Leptospirose',\n",
- " 'LERD': 'LER/Dort',\n",
- " 'LTAN': 'Leishmaniose Tegumentar Americana',\n",
- " 'MALA': 'Malária',\n",
- " 'MENI': 'Meningite',\n",
- " 'MENT': 'Transtornos mentais relacionados ao trabalho',\n",
- " 'NTRA': 'Notificação de Tracoma',\n",
- " 'PAIR': 'Perda auditiva por ruído relacionado ao trabalho',\n",
- " 'PEST': 'Peste',\n",
- " 'PFAN': 'Paralisia Flácida Aguda',\n",
- " 'PNEU': 'Pneumoconioses realacionadas ao trabalho',\n",
- " 'RAIV': 'Raiva',\n",
- " 'SDTA': 'Surto Doenças Transmitidas por Alimentos',\n",
- " 'SIFA': 'Sífilis Adquirida',\n",
- " 'SIFC': 'Sífilis Congênita',\n",
- " 'SIFG': 'Sífilis em Gestante',\n",
- " 'SRC': 'Síndrome da Rubéola Congênia',\n",
- " 'TETA': 'Tétano Acidental',\n",
- " 'TETN': 'Tétano Neonatal',\n",
- " 'TOXC': 'Toxoplasmose Congênita',\n",
- " 'TOXG': 'Toxoplasmose Gestacional',\n",
- " 'TRAC': 'Inquérito de Tracoma',\n",
- " 'TUBE': 'Tuberculose',\n",
- " 'VARC': 'Varicela',\n",
- " 'VIOL': 'Violência doméstica, sexual e/ou outras violências',\n",
- " 'ZIKA': 'Zika Vírus'}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.diseases"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Getting specific files "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[CHIKBR15.dbc,\n",
- " CHIKBR16.dbc,\n",
- " CHIKBR17.dbc,\n",
- " CHIKBR18.dbc,\n",
- " CHIKBR19.dbc,\n",
- " CHIKBR20.dbc,\n",
- " CHIKBR21.dbc,\n",
- " CHIKBR22.dbc,\n",
- " CHIKBR23.dbc,\n",
- " ZIKABR16.dbc,\n",
- " ZIKABR17.dbc,\n",
- " ZIKABR18.dbc,\n",
- " ZIKABR19.dbc,\n",
- " ZIKABR20.dbc,\n",
- " ZIKABR21.dbc,\n",
- " ZIKABR22.dbc,\n",
- " ZIKABR23.dbc]"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.get_files(dis_code=[\"ZIKA\", \"CHIK\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[BOTUBR23.dbc,\n",
- " CHIKBR23.dbc,\n",
- " DENGBR23.dbc,\n",
- " ESQUBR23.dbc,\n",
- " FTIFBR23.dbc,\n",
- " HANSBR23.dbc,\n",
- " MENIBR23.dbc,\n",
- " TOXCBR23.dbc,\n",
- " TOXGBR23.dbc,\n",
- " TUBEBR23.dbc,\n",
- " VARCBR23.dbc,\n",
- " ZIKABR23.dbc]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.get_files(year=2023)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[DENGBR22.dbc, DENGBR23.dbc, ZIKABR22.dbc, ZIKABR23.dbc]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.get_files(dis_code=[\"DENG\", \"ZIKA\"], year=[2022, 2023])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "files = sinan.get_files(dis_code=\"BOTU\", year=[2007, 2008])\n",
- "botu_2007, botu_2008 = files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Describing a file inside DATASUS server"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'name': 'BOTUBR07.dbc',\n",
- " 'disease': 'Botulismo',\n",
- " 'year': 2007,\n",
- " 'size': '7.5 kB',\n",
- " 'last_update': '2021-11-23 11:55AM'}"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.describe(botu_2007)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'name': 'BOTUBR08.dbc',\n",
- " 'disease': 'Botulismo',\n",
- " 'year': 2008,\n",
- " 'size': '8.3 kB',\n",
- " 'last_update': '2021-11-23 11:56AM'}"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.describe(botu_2008)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Downloading files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can rather download multiple files or download them individually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████| 8.35k/8.35k [00:00<00:00, 6.12MB/s]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[/home/bida/pysus/BOTUBR07.parquet, /home/bida/pysus/BOTUBR08.parquet]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinan.download(files) # Downloads to default directory"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "/home/bida/pysus/BOTUBR07.parquet"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "botu_2007.download() # or specify a custom directory with `local_dir=`"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reading files\n",
- "\n",
- "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "parquet = botu_2007.download()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " TP_NOT | \n",
- " ID_AGRAVO | \n",
- " DT_NOTIFIC | \n",
- " SEM_NOT | \n",
- " NU_ANO | \n",
- " SG_UF_NOT | \n",
- " ID_MUNICIP | \n",
- " ID_REGIONA | \n",
- " ID_UNIDADE | \n",
- " DT_SIN_PRI | \n",
- " ... | \n",
- " NU_PROTEI | \n",
- " DT_LIQUOR | \n",
- " TP_SENSITI | \n",
- " TP_MOTORA | \n",
- " TP_REPETE | \n",
- " DS_ALI1 | \n",
- " DS_ALI2 | \n",
- " DS_LOCAL1 | \n",
- " DS_LOCAL2 | \n",
- " DT_ENCERRA | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-01-12 | \n",
- " 200702 | \n",
- " 2007 | \n",
- " 41 | \n",
- " 410690 | \n",
- " 1356 | \n",
- " 2384299 | \n",
- " 2006-12-31 | \n",
- " ... | \n",
- " 53 | \n",
- " 20070112 | \n",
- " 2 | \n",
- " 2 | \n",
- " 3 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070222 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-01-12 | \n",
- " 200702 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 355030 | \n",
- " 1331 | \n",
- " 2077485 | \n",
- " 2007-01-10 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " TORTA DE PALMITO/ FRANGO . | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070323 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-01-03 | \n",
- " 200701 | \n",
- " 2007 | \n",
- " 52 | \n",
- " 521180 | \n",
- " 1791 | \n",
- " 2381532 | \n",
- " 2006-11-16 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070515 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-01-08 | \n",
- " 200702 | \n",
- " 2007 | \n",
- " 43 | \n",
- " 431410 | \n",
- " 1611 | \n",
- " 2246988 | \n",
- " 2007-01-03 | \n",
- " ... | \n",
- " 31 | \n",
- " 20070107 | \n",
- " | \n",
- " | \n",
- " | \n",
- " SALAME | \n",
- " | \n",
- " DOMICILIO | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-02-27 | \n",
- " 200709 | \n",
- " 2007 | \n",
- " 50 | \n",
- " 500830 | \n",
- " 1973 | \n",
- " 2757206 | \n",
- " 2007-02-27 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070816 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-02-16 | \n",
- " 200707 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 354340 | \n",
- " 1348 | \n",
- " 2082187 | \n",
- " 2006-12-12 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-02-15 | \n",
- " 200707 | \n",
- " 2007 | \n",
- " 33 | \n",
- " 330170 | \n",
- " | \n",
- " 2290227 | \n",
- " 2007-02-12 | \n",
- " ... | \n",
- " 27 | \n",
- " 20070214 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070724 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-02-28 | \n",
- " 200709 | \n",
- " 2007 | \n",
- " 22 | \n",
- " 220800 | \n",
- " 1888 | \n",
- " 4009622 | \n",
- " 2007-02-28 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070503 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-05-12 | \n",
- " 200719 | \n",
- " 2007 | \n",
- " 27 | \n",
- " 270710 | \n",
- " 1539 | \n",
- " 2721643 | \n",
- " 2007-05-09 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070716 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-06-11 | \n",
- " 200724 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 354850 | \n",
- " 1349 | \n",
- " 2025752 | \n",
- " 2007-06-11 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20071120 | \n",
- "
\n",
- " \n",
- " | 10 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-06-22 | \n",
- " 200725 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 352340 | \n",
- " 1342 | \n",
- " 2023709 | \n",
- " 2007-06-12 | \n",
- " ... | \n",
- " 40 | \n",
- " 20070616 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070704 | \n",
- "
\n",
- " \n",
- " | 11 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-06-27 | \n",
- " 200726 | \n",
- " 2007 | \n",
- " 29 | \n",
- " 292740 | \n",
- " 1380 | \n",
- " 0004057 | \n",
- " 2007-06-25 | \n",
- " ... | \n",
- " 39 | \n",
- " 20070625 | \n",
- " 2 | \n",
- " 2 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070730 | \n",
- "
\n",
- " \n",
- " | 12 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-07-30 | \n",
- " 200731 | \n",
- " 2007 | \n",
- " 53 | \n",
- " 530010 | \n",
- " | \n",
- " 0010456 | \n",
- " 2007-07-26 | \n",
- " ... | \n",
- " 83 | \n",
- " 20070728 | \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070730 | \n",
- "
\n",
- " \n",
- " | 13 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-07-10 | \n",
- " 200728 | \n",
- " 2007 | \n",
- " 52 | \n",
- " 520870 | \n",
- " 1779 | \n",
- " 2338262 | \n",
- " 2007-05-08 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070814 | \n",
- "
\n",
- " \n",
- " | 14 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-08-28 | \n",
- " 200735 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 354850 | \n",
- " 1349 | \n",
- " 2079720 | \n",
- " 2007-08-17 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " SALGADO TORTA PAO COM QUEIJO C | \n",
- " | \n",
- " CASA DE MASSAS ROMANA EM SANTO | \n",
- " | \n",
- " 20070903 | \n",
- "
\n",
- " \n",
- " | 15 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-08-24 | \n",
- " 200734 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 354850 | \n",
- " 1349 | \n",
- " 2079720 | \n",
- " 2007-08-17 | \n",
- " ... | \n",
- " | \n",
- " 20070819 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20071025 | \n",
- "
\n",
- " \n",
- " | 16 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-08-17 | \n",
- " 200733 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 355030 | \n",
- " 1331 | \n",
- " 2077574 | \n",
- " 2007-08-12 | \n",
- " ... | \n",
- " 40 | \n",
- " 20070813 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 17 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-08-20 | \n",
- " 200734 | \n",
- " 2007 | \n",
- " 51 | \n",
- " 510340 | \n",
- " 1578 | \n",
- " 2495015 | \n",
- " 2007-08-18 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20070918 | \n",
- "
\n",
- " \n",
- " | 18 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-08-23 | \n",
- " 200734 | \n",
- " 2007 | \n",
- " 23 | \n",
- " 230440 | \n",
- " 1519 | \n",
- " 2482169 | \n",
- " 2007-08-19 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " MORTADELA | \n",
- " | \n",
- " DOMICILIO | \n",
- " | \n",
- " 20071023 | \n",
- "
\n",
- " \n",
- " | 19 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-09-16 | \n",
- " 200738 | \n",
- " 2007 | \n",
- " 31 | \n",
- " 317010 | \n",
- " 1461 | \n",
- " 2206595 | \n",
- " 2007-09-08 | \n",
- " ... | \n",
- " | \n",
- " 20070915 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20071017 | \n",
- "
\n",
- " \n",
- " | 20 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-09-06 | \n",
- " 200736 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 354340 | \n",
- " 1348 | \n",
- " 2082187 | \n",
- " 2007-08-31 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 21 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-09-04 | \n",
- " 200736 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 355220 | \n",
- " 1353 | \n",
- " 2081695 | \n",
- " 2007-08-31 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 22 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-10-08 | \n",
- " 200741 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 355220 | \n",
- " 1353 | \n",
- " 2081695 | \n",
- " 2007-10-08 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " | 23 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-10-02 | \n",
- " 200740 | \n",
- " 2007 | \n",
- " 31 | \n",
- " 314330 | \n",
- " 1473 | \n",
- " 2149990 | \n",
- " 2007-09-28 | \n",
- " ... | \n",
- " 19 | \n",
- " 20070930 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20080601 | \n",
- "
\n",
- " \n",
- " | 24 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-11-06 | \n",
- " 200745 | \n",
- " 2007 | \n",
- " 35 | \n",
- " 355220 | \n",
- " 1353 | \n",
- " 2078732 | \n",
- " 2007-11-03 | \n",
- " ... | \n",
- " | \n",
- " 20071105 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20080108 | \n",
- "
\n",
- " \n",
- " | 25 | \n",
- " 2 | \n",
- " A051 | \n",
- " 2007-11-01 | \n",
- " 200744 | \n",
- " 2007 | \n",
- " 52 | \n",
- " 520870 | \n",
- " 1779 | \n",
- " 2518406 | \n",
- " 2007-10-02 | \n",
- " ... | \n",
- " | \n",
- " | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " 20071112 | \n",
- "
\n",
- " \n",
- "
\n",
- "
26 rows × 140 columns
\n",
- "
"
- ],
- "text/plain": [
- " TP_NOT ID_AGRAVO DT_NOTIFIC SEM_NOT NU_ANO SG_UF_NOT ID_MUNICIP \\\n",
- "0 2 A051 2007-01-12 200702 2007 41 410690 \n",
- "1 2 A051 2007-01-12 200702 2007 35 355030 \n",
- "2 2 A051 2007-01-03 200701 2007 52 521180 \n",
- "3 2 A051 2007-01-08 200702 2007 43 431410 \n",
- "4 2 A051 2007-02-27 200709 2007 50 500830 \n",
- "5 2 A051 2007-02-16 200707 2007 35 354340 \n",
- "6 2 A051 2007-02-15 200707 2007 33 330170 \n",
- "7 2 A051 2007-02-28 200709 2007 22 220800 \n",
- "8 2 A051 2007-05-12 200719 2007 27 270710 \n",
- "9 2 A051 2007-06-11 200724 2007 35 354850 \n",
- "10 2 A051 2007-06-22 200725 2007 35 352340 \n",
- "11 2 A051 2007-06-27 200726 2007 29 292740 \n",
- "12 2 A051 2007-07-30 200731 2007 53 530010 \n",
- "13 2 A051 2007-07-10 200728 2007 52 520870 \n",
- "14 2 A051 2007-08-28 200735 2007 35 354850 \n",
- "15 2 A051 2007-08-24 200734 2007 35 354850 \n",
- "16 2 A051 2007-08-17 200733 2007 35 355030 \n",
- "17 2 A051 2007-08-20 200734 2007 51 510340 \n",
- "18 2 A051 2007-08-23 200734 2007 23 230440 \n",
- "19 2 A051 2007-09-16 200738 2007 31 317010 \n",
- "20 2 A051 2007-09-06 200736 2007 35 354340 \n",
- "21 2 A051 2007-09-04 200736 2007 35 355220 \n",
- "22 2 A051 2007-10-08 200741 2007 35 355220 \n",
- "23 2 A051 2007-10-02 200740 2007 31 314330 \n",
- "24 2 A051 2007-11-06 200745 2007 35 355220 \n",
- "25 2 A051 2007-11-01 200744 2007 52 520870 \n",
- "\n",
- " ID_REGIONA ID_UNIDADE DT_SIN_PRI ... NU_PROTEI DT_LIQUOR TP_SENSITI \\\n",
- "0 1356 2384299 2006-12-31 ... 53 20070112 2 \n",
- "1 1331 2077485 2007-01-10 ... \n",
- "2 1791 2381532 2006-11-16 ... \n",
- "3 1611 2246988 2007-01-03 ... 31 20070107 \n",
- "4 1973 2757206 2007-02-27 ... \n",
- "5 1348 2082187 2006-12-12 ... 1 \n",
- "6 2290227 2007-02-12 ... 27 20070214 \n",
- "7 1888 4009622 2007-02-28 ... \n",
- "8 1539 2721643 2007-05-09 ... \n",
- "9 1349 2025752 2007-06-11 ... \n",
- "10 1342 2023709 2007-06-12 ... 40 20070616 \n",
- "11 1380 0004057 2007-06-25 ... 39 20070625 2 \n",
- "12 0010456 2007-07-26 ... 83 20070728 1 \n",
- "13 1779 2338262 2007-05-08 ... \n",
- "14 1349 2079720 2007-08-17 ... \n",
- "15 1349 2079720 2007-08-17 ... 20070819 \n",
- "16 1331 2077574 2007-08-12 ... 40 20070813 \n",
- "17 1578 2495015 2007-08-18 ... \n",
- "18 1519 2482169 2007-08-19 ... \n",
- "19 1461 2206595 2007-09-08 ... 20070915 \n",
- "20 1348 2082187 2007-08-31 ... \n",
- "21 1353 2081695 2007-08-31 ... \n",
- "22 1353 2081695 2007-10-08 ... \n",
- "23 1473 2149990 2007-09-28 ... 19 20070930 \n",
- "24 1353 2078732 2007-11-03 ... 20071105 \n",
- "25 1779 2518406 2007-10-02 ... 1 \n",
- "\n",
- " TP_MOTORA TP_REPETE DS_ALI1 DS_ALI2 \\\n",
- "0 2 3 \n",
- "1 TORTA DE PALMITO/ FRANGO . \n",
- "2 \n",
- "3 SALAME \n",
- "4 \n",
- "5 2 2 \n",
- "6 \n",
- "7 \n",
- "8 \n",
- "9 \n",
- "10 \n",
- "11 2 \n",
- "12 2 2 \n",
- "13 \n",
- "14 SALGADO TORTA PAO COM QUEIJO C \n",
- "15 \n",
- "16 \n",
- "17 \n",
- "18 MORTADELA \n",
- "19 \n",
- "20 \n",
- "21 \n",
- "22 \n",
- "23 \n",
- "24 \n",
- "25 1 1 \n",
- "\n",
- " DS_LOCAL1 DS_LOCAL2 DT_ENCERRA \n",
- "0 20070222 \n",
- "1 20070323 \n",
- "2 20070515 \n",
- "3 DOMICILIO \n",
- "4 20070816 \n",
- "5 \n",
- "6 20070724 \n",
- "7 20070503 \n",
- "8 20070716 \n",
- "9 20071120 \n",
- "10 20070704 \n",
- "11 20070730 \n",
- "12 20070730 \n",
- "13 20070814 \n",
- "14 CASA DE MASSAS ROMANA EM SANTO 20070903 \n",
- "15 20071025 \n",
- "16 \n",
- "17 20070918 \n",
- "18 DOMICILIO 20071023 \n",
- "19 20071017 \n",
- "20 \n",
- "21 \n",
- "22 \n",
- "23 20080601 \n",
- "24 20080108 \n",
- "25 20071112 \n",
- "\n",
- "[26 rows x 140 columns]"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquet.to_dataframe()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
- },
- "vscode": {
- "interpreter": {
- "hash": "2a96a5ccec8dfcba7d06b2e71f6eef3b5dac5716461bf5d73ea1bb7ee462cdaa"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/docs/source/databases/SINASC.ipynb b/docs/source/databases/SINASC.ipynb
deleted file mode 100644
index 097e9c40..00000000
--- a/docs/source/databases/SINASC.ipynb
+++ /dev/null
@@ -1,693 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "# SINASC FTP Database\n",
- "##### Sistema de Informações sobre Nascidos Vivos\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "from pysus import SINASC\n",
- "sinasc = SINASC().load() # Loads the files from DATASUS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SINASC - Sistema de Informações sobre Nascidos Vivos"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinasc"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'long_name': 'Sistema de Informações sobre Nascidos Vivos',\n",
- " 'source': 'http://sinasc.saude.gov.br/',\n",
- " 'description': ''}"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinasc.metadata"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Listing codes & groups"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'DN': 'Declarações de Nascidos Vivos',\n",
- " 'DNR': 'Dados dos Nascidos Vivos por UF de residência'}"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinasc.groups"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Getting specific files "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[DNSP1996.DBC,\n",
- " DNSP1997.DBC,\n",
- " DNSP1998.DBC,\n",
- " DNSP1999.DBC,\n",
- " DNSP2000.DBC,\n",
- " DNSP2001.DBC,\n",
- " DNSP2002.DBC,\n",
- " DNSP2003.DBC,\n",
- " DNSP2004.DBC,\n",
- " DNSP2005.dbc,\n",
- " DNSP2006.DBC,\n",
- " DNSP2007.dbc,\n",
- " DNSP2008.dbc,\n",
- " DNSP2009.dbc,\n",
- " DNSP2010.DBC,\n",
- " DNSP2011.DBC,\n",
- " DNSP2012.DBC,\n",
- " DNSP2013.dbc,\n",
- " DNSP2014.dbc,\n",
- " DNSP2015.dbc,\n",
- " DNSP2016.dbc,\n",
- " DNSP2017.dbc,\n",
- " DNSP2018.dbc,\n",
- " DNSP2019.dbc,\n",
- " DNSP2020.dbc,\n",
- " DNSP2021.dbc]"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinasc.get_files(\"DN\", uf=\"SP\") # or multiple [\"SP\", ...]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[DNSP1999.DBC, DNSP2000.DBC]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "files = sinasc.get_files(\"DN\", uf=\"SP\", year=[1999, 2000])\n",
- "files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Describing files inside DATASUS server"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'group': 'Declarações de Nascidos Vivos',\n",
- " 'last_update': '2020-01-27 12:12PM',\n",
- " 'name': 'DNSP1999.DBC',\n",
- " 'size': '14.3 MB',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 1999}\n",
- "{'group': 'Declarações de Nascidos Vivos',\n",
- " 'last_update': '2020-01-27 12:12PM',\n",
- " 'name': 'DNSP2000.DBC',\n",
- " 'size': '14.3 MB',\n",
- " 'uf': 'São Paulo',\n",
- " 'year': 2000}\n"
- ]
- }
- ],
- "source": [
- "from pprint import pprint\n",
- "for file in files:\n",
- " pprint(sinasc.describe(file))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Downloading files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can rather download multiple files or download them individually:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "DNSP2000.parquet: 100%|████████████| 523k/523k [00:15<00:00, 34.3kB/s]\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "[/home/bida/pysus/DNSP1999.parquet, /home/bida/pysus/DNSP2000.parquet]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sinasc.download(files) # Downloads to default directory or specify with `local_dir=`"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "@Note: If the file has been downloaded already, it's required to delete it in order to download the lastest updated file from DATASUS. "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Reading files\n",
- "\n",
- "PySUS uses Parquets as output, use the method `to_dataframe()` to read the file as pandas DataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|████████████████████████████| 14.3M/14.3M [00:00<00:00, 6.32GB/s]\n"
- ]
- }
- ],
- "source": [
- "parquet = sinasc.download(files)[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " contador | \n",
- " LOCNASC | \n",
- " CODMUNNASC | \n",
- " IDADEMAE | \n",
- " ESTCIVMAE | \n",
- " ESCMAE | \n",
- " CODOCUPMAE | \n",
- " QTDFILVIVO | \n",
- " QTDFILMORT | \n",
- " CODMUNRES | \n",
- " ... | \n",
- " GRAVIDEZ | \n",
- " PARTO | \n",
- " CONSULTAS | \n",
- " DTNASC | \n",
- " SEXO | \n",
- " APGAR1 | \n",
- " APGAR5 | \n",
- " RACACOR | \n",
- " PESO | \n",
- " CODANOMAL | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 90001 | \n",
- " 1 | \n",
- " 3550308 | \n",
- " 21 | \n",
- " 9 | \n",
- " 2 | \n",
- " | \n",
- " 03 | \n",
- " 00 | \n",
- " 3550308 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 8 | \n",
- " 20021999 | \n",
- " 2 | \n",
- " 09 | \n",
- " 10 | \n",
- " | \n",
- " 3300 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 90002 | \n",
- " 1 | \n",
- " 3550308 | \n",
- " 23 | \n",
- " 9 | \n",
- " 3 | \n",
- " | \n",
- " 01 | \n",
- " 00 | \n",
- " 3550308 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 8 | \n",
- " 19031999 | \n",
- " 2 | \n",
- " 09 | \n",
- " 10 | \n",
- " | \n",
- " 2300 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 90003 | \n",
- " 1 | \n",
- " 3550308 | \n",
- " 19 | \n",
- " 9 | \n",
- " 2 | \n",
- " | \n",
- " 01 | \n",
- " 00 | \n",
- " 3550308 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 8 | \n",
- " 09031999 | \n",
- " 1 | \n",
- " 08 | \n",
- " 09 | \n",
- " 4 | \n",
- " 3600 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 90004 | \n",
- " 1 | \n",
- " 3550308 | \n",
- " 16 | \n",
- " 9 | \n",
- " 2 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 3550308 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 8 | \n",
- " 15031999 | \n",
- " 2 | \n",
- " 09 | \n",
- " 10 | \n",
- " 1 | \n",
- " 2600 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 90005 | \n",
- " 1 | \n",
- " 3550308 | \n",
- " 33 | \n",
- " 9 | \n",
- " 2 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 3550308 | \n",
- " ... | \n",
- " 1 | \n",
- " 2 | \n",
- " 8 | \n",
- " 17031999 | \n",
- " 2 | \n",
- " 09 | \n",
- " 09 | \n",
- " | \n",
- " 2700 | \n",
- " | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 714423 | \n",
- " 299996 | \n",
- " 1 | \n",
- " 3522604 | \n",
- " 18 | \n",
- " 9 | \n",
- " 3 | \n",
- " | \n",
- " 01 | \n",
- " 00 | \n",
- " 3522604 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 20051999 | \n",
- " 1 | \n",
- " 08 | \n",
- " 10 | \n",
- " 1 | \n",
- " 3400 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 714424 | \n",
- " 299997 | \n",
- " 1 | \n",
- " 3522604 | \n",
- " 37 | \n",
- " 2 | \n",
- " 2 | \n",
- " | \n",
- " 01 | \n",
- " 00 | \n",
- " 3522604 | \n",
- " ... | \n",
- " 1 | \n",
- " 2 | \n",
- " 4 | \n",
- " 29051999 | \n",
- " 1 | \n",
- " 09 | \n",
- " 10 | \n",
- " 1 | \n",
- " 3800 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 714425 | \n",
- " 299998 | \n",
- " 1 | \n",
- " 3522604 | \n",
- " 21 | \n",
- " 9 | \n",
- " 4 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 3522604 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 04061999 | \n",
- " 2 | \n",
- " 09 | \n",
- " 10 | \n",
- " 1 | \n",
- " 3500 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 714426 | \n",
- " 299999 | \n",
- " 1 | \n",
- " 3522604 | \n",
- " 34 | \n",
- " 2 | \n",
- " 4 | \n",
- " | \n",
- " 00 | \n",
- " 00 | \n",
- " 3522604 | \n",
- " ... | \n",
- " 1 | \n",
- " 2 | \n",
- " 3 | \n",
- " 04061999 | \n",
- " 2 | \n",
- " 09 | \n",
- " 10 | \n",
- " 1 | \n",
- " 3800 | \n",
- " | \n",
- "
\n",
- " \n",
- " | 714427 | \n",
- " 300000 | \n",
- " 1 | \n",
- " 3522604 | \n",
- " 37 | \n",
- " 9 | \n",
- " 2 | \n",
- " | \n",
- " 05 | \n",
- " 00 | \n",
- " 3522604 | \n",
- " ... | \n",
- " 1 | \n",
- " 1 | \n",
- " 8 | \n",
- " 29051999 | \n",
- " 2 | \n",
- " 10 | \n",
- " 10 | \n",
- " 1 | \n",
- " 3100 | \n",
- " | \n",
- "
\n",
- " \n",
- "
\n",
- "
714428 rows × 21 columns
\n",
- "
"
- ],
- "text/plain": [
- " contador LOCNASC CODMUNNASC IDADEMAE ESTCIVMAE ESCMAE CODOCUPMAE \\\n",
- "0 90001 1 3550308 21 9 2 \n",
- "1 90002 1 3550308 23 9 3 \n",
- "2 90003 1 3550308 19 9 2 \n",
- "3 90004 1 3550308 16 9 2 \n",
- "4 90005 1 3550308 33 9 2 \n",
- "... ... ... ... ... ... ... ... \n",
- "714423 299996 1 3522604 18 9 3 \n",
- "714424 299997 1 3522604 37 2 2 \n",
- "714425 299998 1 3522604 21 9 4 \n",
- "714426 299999 1 3522604 34 2 4 \n",
- "714427 300000 1 3522604 37 9 2 \n",
- "\n",
- " QTDFILVIVO QTDFILMORT CODMUNRES ... GRAVIDEZ PARTO CONSULTAS \\\n",
- "0 03 00 3550308 ... 1 1 8 \n",
- "1 01 00 3550308 ... 1 1 8 \n",
- "2 01 00 3550308 ... 1 1 8 \n",
- "3 00 00 3550308 ... 1 1 8 \n",
- "4 00 00 3550308 ... 1 2 8 \n",
- "... ... ... ... ... ... ... ... \n",
- "714423 01 00 3522604 ... 1 1 4 \n",
- "714424 01 00 3522604 ... 1 2 4 \n",
- "714425 00 00 3522604 ... 1 1 4 \n",
- "714426 00 00 3522604 ... 1 2 3 \n",
- "714427 05 00 3522604 ... 1 1 8 \n",
- "\n",
- " DTNASC SEXO APGAR1 APGAR5 RACACOR PESO CODANOMAL \n",
- "0 20021999 2 09 10 3300 \n",
- "1 19031999 2 09 10 2300 \n",
- "2 09031999 1 08 09 4 3600 \n",
- "3 15031999 2 09 10 1 2600 \n",
- "4 17031999 2 09 09 2700 \n",
- "... ... ... ... ... ... ... ... \n",
- "714423 20051999 1 08 10 1 3400 \n",
- "714424 29051999 1 09 10 1 3800 \n",
- "714425 04061999 2 09 10 1 3500 \n",
- "714426 04061999 2 09 10 1 3800 \n",
- "714427 29051999 2 10 10 1 3100 \n",
- "\n",
- "[714428 rows x 21 columns]"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "parquet.to_dataframe()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
- },
- "vscode": {
- "interpreter": {
- "hash": "2a96a5ccec8dfcba7d06b2e71f6eef3b5dac5716461bf5d73ea1bb7ee462cdaa"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b1f9446d..9b036f3b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -14,7 +14,7 @@ Contents:
:maxdepth: 2
Data Sources
- Tutorials
+ API Reference
Indices and tables
diff --git a/pysus/api/__init__.py b/pysus/api/__init__.py
index 1af15464..44fc4270 100644
--- a/pysus/api/__init__.py
+++ b/pysus/api/__init__.py
@@ -1 +1,7 @@
+"""PySUS public API for accessing Brazilian public health data.
+
+Provides clients for DuckLake, FTP, and DadosGov data sources,
+file format handlers, and high-level convenience functions.
+"""
+
from .client import PySUS as PySUSClient # noqa
diff --git a/pysus/api/_impl/__init__.py b/pysus/api/_impl/__init__.py
index e69de29b..f84b8bfe 100644
--- a/pysus/api/_impl/__init__.py
+++ b/pysus/api/_impl/__init__.py
@@ -0,0 +1,6 @@
+"""Implementation module for high-level data access functions.
+
+Exposes convenience functions (sinan, sim, sih, etc.) that
+combine catalog querying, downloading, and Parquet reading
+into a single call.
+"""
diff --git a/pysus/api/_impl/databases.py b/pysus/api/_impl/databases.py
index fc8aecab..e36b2a11 100644
--- a/pysus/api/_impl/databases.py
+++ b/pysus/api/_impl/databases.py
@@ -1,3 +1,12 @@
+"""High-level convenience functions for fetching Brazilian health data.
+
+Each function wraps an asynchronous query/download pipeline and returns a
+pandas DataFrame. The available datasets cover disease notification (SINAN),
+vital statistics (SINASC, SIM), hospital admissions (SIH), ambulatory care
+(SIA), immunisation (PNI), census data (IBGE), health facilities (CNES),
+and hospitalisation records (CIHA).
+"""
+
__all__ = [
"sinan",
"sinasc",
@@ -32,6 +41,8 @@ def _fetch_data(
show_progress: bool = True,
**kwargs,
) -> pd.DataFrame:
+ """Query, download, and concatenate Parquet files for a given dataset."""
+
async def _fetch():
async with PySUS() as pysus:
years = [year] if isinstance(year, int) else (year or [None])
@@ -77,10 +88,61 @@ async def _fetch():
def sinan(
- disease: str,
+ disease: Literal[
+ "ACBI",
+ "ACGR",
+ "ANIM",
+ "ANTR",
+ "BOTU",
+ "CANC",
+ "CHAG",
+ "CHIK",
+ "COLE",
+ "COQU",
+ "DENG",
+ "DERM",
+ "DIFT",
+ "ESQU",
+ "EXAN",
+ "FMAC",
+ "FTIF",
+ "HANS",
+ "HANT",
+ "HEPA",
+ "IEXO",
+ "INFL",
+ "LEIV",
+ "LEPT",
+ "LERD",
+ "LTAN",
+ "MALA",
+ "MENI",
+ "MENT",
+ "NTRA",
+ "PAIR",
+ "PEST",
+ "PFAN",
+ "PNEU",
+ "RAIV",
+ "SDTA",
+ "SIFA",
+ "SIFC",
+ "SIFG",
+ "SRC",
+ "TETA",
+ "TETN",
+ "TOXC",
+ "TOXG",
+ "TRAC",
+ "TUBE",
+ "VARC",
+ "VIOL",
+ "ZIKA",
+ ],
year: int | list[int],
**kwargs,
) -> pd.DataFrame:
+ """Fetch SINAN records for a given disease and year(s)."""
return _fetch_data(
dataset="sinan",
group=disease.upper(),
@@ -94,6 +156,7 @@ def sinasc(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch SINASC birth certificates for a given state, year(s), and group."""
return _fetch_data(
dataset="sinasc",
state=state.upper(),
@@ -108,6 +171,7 @@ def sim(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch SIM mortality records for a given state, year(s), and group."""
return _fetch_data(
dataset="sim",
state=state.upper(),
@@ -123,6 +187,7 @@ def sih(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch SIH hospital admissions for a state, year, month, and group."""
return _fetch_data(
dataset="sih",
state=state.upper(),
@@ -139,6 +204,7 @@ def sia(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch SIA ambulatory care for a state, year, month, and group."""
return _fetch_data(
dataset="sia",
state=state.upper(),
@@ -154,6 +220,7 @@ def pni(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch PNI immunisation records for a given state, year(s), and group."""
return _fetch_data(
dataset="pni",
state=state.upper(),
@@ -167,6 +234,7 @@ def ibge(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch IBGE census data for given year(s) and optional group."""
return _fetch_data(dataset="ibge", group=group, year=year)
@@ -177,6 +245,7 @@ def cnes(
group: str | None = None,
**kwargs,
) -> pd.DataFrame:
+ """Fetch CNES health facilities for a state, year, month, and group."""
return _fetch_data(
dataset="cnes",
state=state.upper(),
@@ -193,6 +262,7 @@ def ciha(
group: str | None = "CIHA",
**kwargs,
) -> pd.DataFrame:
+ """Fetch CIHA hospitalisation records for state, year, month, and group."""
return _fetch_data(
dataset="ciha",
state=state.upper(),
@@ -220,6 +290,8 @@ def list_files(
month: int | list[int] | None = None,
**kwargs,
) -> pd.DataFrame:
+ """List catalog files for a dataset, filtered by group/state/year/month."""
+
async def _list():
async with PySUS() as pysus:
ducklake = await pysus.get_ducklake()
diff --git a/pysus/api/client.py b/pysus/api/client.py
index 707817a3..99763a72 100644
--- a/pysus/api/client.py
+++ b/pysus/api/client.py
@@ -1,3 +1,9 @@
+"""Main orchestrator for the PySUS data pipeline.
+
+Manages file downloads, local state tracking, catalog attachment,
+Parquet conversion, and query execution across multiple backends.
+"""
+
import enum
from collections.abc import Callable
from datetime import datetime
@@ -20,10 +26,12 @@
class Base(DeclarativeBase):
- pass
+ """Base declarative class for SQLAlchemy ORM models."""
class DownloadStatus(enum.Enum):
+ """Download status values tracked for each local file."""
+
PENDING = "pending"
DOWNLOADING = "downloading"
COMPLETED = "completed"
@@ -32,6 +40,8 @@ class DownloadStatus(enum.Enum):
class LocalFileState(Base):
+ """ORM model tracking the state of a downloaded local file."""
+
__tablename__ = "local_file_state"
path: Mapped[str] = mapped_column(String, primary_key=True)
remote_path: Mapped[str] = mapped_column(String, nullable=False)
@@ -54,7 +64,11 @@ class LocalFileState(Base):
class PySUS:
+ """Central orchestrator for downloading and querying PySUS datasets."""
+
def __init__(self, db_path: Path = CACHEPATH / "config.db"):
+ """Initialize PySUS with a DuckDB-backed SQLAlchemy engine."""
+
db_path = Path(db_path)
db_path.parent.mkdir(parents=True, exist_ok=True)
@@ -68,6 +82,8 @@ def __init__(self, db_path: Path = CACHEPATH / "config.db"):
self._dadosgov: DadosGovClient | None = None
async def __aenter__(self):
+ """Set up DuckLake catalog and return self as async context manager."""
+
self._ducklake = DuckLake()
await self._ducklake._load_catalog()
self._attach_client_catalog(
@@ -77,6 +93,8 @@ async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Clean up all client connections and dispose of the engine."""
+
if self._ducklake:
await self._ducklake.close()
if self._ftp:
@@ -86,6 +104,8 @@ async def __aexit__(self, exc_type, exc_val, exc_tb):
self.engine.dispose()
async def get_ducklake(self) -> DuckLake:
+ """Return the DuckLake client, initializing it lazily if needed."""
+
if self._ducklake is None:
self._ducklake = DuckLake()
await self._ducklake._load_catalog()
@@ -96,12 +116,16 @@ async def get_ducklake(self) -> DuckLake:
return self._ducklake
async def get_dadosgov(self, access_token: str | None) -> DadosGovClient:
+ """Return the DadosGov client, connecting lazily if needed."""
+
if self._dadosgov is None:
self._dadosgov = DadosGovClient()
await self._dadosgov.connect(token=access_token)
return self._dadosgov
async def get_ftp(self) -> FTPClient:
+ """Return the FTP client, connecting lazily if needed."""
+
if self._ftp is None:
self._ftp = FTPClient()
await self._ftp.connect()
@@ -111,6 +135,8 @@ async def get_local_file(
self,
file: BaseRemoteFile,
) -> BaseLocalFile | None:
+ """Look up a previously downloaded file by its remote path."""
+
from pysus.api.extensions import ExtensionFactory
client_name = file.client.name.lower()
@@ -138,6 +164,8 @@ async def get_local_file(
return await ExtensionFactory.instantiate(str(record.path))
def _attach_client_catalog(self, name: str, path: str):
+ """Attach an external DuckDB catalog to the engine if not attached."""
+
abs_path = str(Path(path).absolute())
with self.engine.connect() as conn:
q = "SELECT database_name FROM duckdb_databases() WHERE path = ?"
@@ -149,6 +177,8 @@ def _attach_client_catalog(self, name: str, path: str):
)
def _get_dest_path(self, file: BaseRemoteFile) -> Path:
+ """Build the local filesystem path for a given remote file."""
+
client_name = file.client.name.lower()
dataset_name = file.dataset.name.lower()
@@ -174,6 +204,8 @@ async def _update_state(
state: str | None = None,
group: str | None = None,
):
+ """Create or update the LocalFileState record for a file."""
+
with self.Session() as session:
record = (
session.query(LocalFileState)
@@ -204,6 +236,8 @@ async def download(
token: str | None = None,
callback: Callable | None = None,
) -> BaseLocalFile:
+ """Download a remote file and return a local file handle."""
+
from pysus.api.extensions import ExtensionFactory
existing_local = await self.get_local_file(file)
@@ -264,6 +298,8 @@ async def download(
) from e
async def _delete_record(self, path: str):
+ """Delete a LocalFileState record from the database."""
+
with self.Session() as session:
record = session.query(LocalFileState).filter_by(path=path).first()
if record:
@@ -276,6 +312,8 @@ async def download_to_parquet(
token: str | None = None,
callback: Callable[[int, int], None] | None = None,
) -> Parquet:
+ """Download a file and convert it to Parquet format."""
+
local_file = await self.download(
file=file,
token=token,
@@ -308,6 +346,8 @@ async def download_to_parquet(
)
def get_local_hierarchy(self):
+ """Build a nested dict of cached files grouped by client and dataset."""
+
with self.Session() as session:
records = session.query(LocalFileState).all()
@@ -338,6 +378,8 @@ def get_local_hierarchy(self):
return hierarchy
def get_completed_remote_paths(self) -> set[str]:
+ """Return remote paths for all successfully downloaded files."""
+
with self.Session() as session:
records = (
session.query(LocalFileState.remote_path)
@@ -354,6 +396,8 @@ async def query(
year: int | None = None,
month: int | None = None,
):
+ """Query available datasets through the DuckLake catalog."""
+
if self._ducklake is None:
await self.get_ducklake()
if self._ducklake is not None:
@@ -371,6 +415,8 @@ def read_parquet(
sql: str | None = None,
mode: Literal["union", "intersection", "strict"] = "union",
) -> "DuckDBPyConnection":
+ """Read Parquet files with optional schema handling and SQL filter."""
+
if not paths:
raise ValueError("No paths provided")
diff --git a/pysus/api/dadosgov/__init__.py b/pysus/api/dadosgov/__init__.py
index 1efe5d57..30269ecb 100644
--- a/pysus/api/dadosgov/__init__.py
+++ b/pysus/api/dadosgov/__init__.py
@@ -1 +1,3 @@
+"""Client for the Brazilian Open Data Portal (dados.gov.br)."""
+
from .client import DadosGov as DadosGovClient # noqa
diff --git a/pysus/api/dadosgov/client.py b/pysus/api/dadosgov/client.py
index 0051d9e5..4e6cbde8 100644
--- a/pysus/api/dadosgov/client.py
+++ b/pysus/api/dadosgov/client.py
@@ -1,3 +1,5 @@
+"""HTTP client and data models for the dados.gov.br API."""
+
from __future__ import annotations
import pathlib
@@ -15,6 +17,7 @@
def to_datetime(value: Any) -> datetime | None:
+ """Parse a Brazilian date string into a datetime object."""
if not value or not isinstance(value, str) or "Indisponível" in value:
return None
for fmt in ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y"):
@@ -26,6 +29,7 @@ def to_datetime(value: Any) -> datetime | None:
def to_bool(value: Any) -> bool:
+ """Return True for True or the strings "sim", "true", "1"; else False."""
if isinstance(value, bool):
return value
return str(value).lower() in ("sim", "true", "1")
@@ -36,27 +40,34 @@ def to_bool(value: Any) -> bool:
class DadosGov(BaseRemoteClient):
+ """Client for the dados.gov.br open data portal API."""
+
base_url: str = "https://dados.gov.br/dados/api"
_token: str | None = PrivateAttr(default=None)
_client: httpx.AsyncClient | None = PrivateAttr(default=None)
def __init__(self, **data):
+ """Initialize the DadosGov client."""
super().__init__(**data)
@property
def name(self) -> str:
+ """Return the short client name."""
return "DadosGov"
@property
def long_name(self) -> str:
+ """Return the human-readable client name."""
return "Portal Brasileiro de Dados Abertos"
@property
def description(self) -> str:
+ """Return a description of the client."""
return "Interface de acesso ao API do Portal de Dados Abertos"
async def connect(self, token: str | None = None) -> None:
+ """Connect to the dados.gov.br API with the given token."""
_token = token or self._token
if not _token:
@@ -84,19 +95,23 @@ async def connect(self, token: str | None = None) -> None:
)
async def login(self, token: str | None = None, **kwargs) -> None:
+ """Authenticate with the API (delegates to connect)."""
await self.connect(token=token)
async def close(self) -> None:
+ """Close the underlying HTTP client."""
if self._client:
await self._client.aclose()
self._client = None
async def datasets(self, **kwargs) -> list[Dataset]:
+ """Return a list of pre-configured health datasets."""
from .databases import AVAILABLE_DATABASES
return [db_class(client=self) for db_class in AVAILABLE_DATABASES]
async def list_datasets(self, **kwargs) -> list[ConjuntoDados]:
+ """Search and list available datasets from the portal."""
if self._client is None:
raise ConnectionError(
"Client not connected. Call login(token=...) first.",
@@ -121,6 +136,7 @@ async def list_datasets(self, **kwargs) -> list[ConjuntoDados]:
return [ConjuntoDados(**item, client=self) for item in data]
async def get_dataset(self, id: str) -> ConjuntoDados:
+ """Fetch a single dataset by its ID."""
if self._client is None:
raise ConnectionError(
"Client not connected. Call login(token=...) first.",
@@ -140,6 +156,7 @@ async def _download_file(
output: pathlib.Path,
callback: Callable[[int], None] | None = None,
) -> pathlib.Path:
+ """Download a remote file to a local path."""
if self._client is None:
raise ConnectionError(
"Client not connected. Call login(token=...) first.",
@@ -156,6 +173,8 @@ async def _download_file(
class Recurso(BaseModel):
+ """A single resource (file) within a dataset on dados.gov.br."""
+
model_config = ConfigDict(populate_by_name=True)
id: str
@@ -168,6 +187,7 @@ class Recurso(BaseModel):
file_name: str | None = Field(None, alias="nomeArquivo")
async def get_size(self) -> int:
+ """Retrieve the file size from the remote server."""
async with httpx.AsyncClient(follow_redirects=True) as client:
response = await client.head(self.url)
@@ -182,6 +202,8 @@ async def get_size(self) -> int:
class ConjuntoDados(BaseModel):
+ """A dataset group as returned by the dados.gov.br API."""
+
model_config = ConfigDict(populate_by_name=True)
client: BaseRemoteClient | None = None
diff --git a/pysus/api/dadosgov/databases.py b/pysus/api/dadosgov/databases.py
index f8f033bb..cc456593 100644
--- a/pysus/api/dadosgov/databases.py
+++ b/pysus/api/dadosgov/databases.py
@@ -1,9 +1,13 @@
+"""Pre-configured health database definitions accessible via dados.gov.br."""
+
from typing import Any
from .models import Dataset
class CNES(Dataset):
+ """Cadastro Nacional de Estabelecimentos de Saúde (CNES)."""
+
ids: list[str] = [
"40a0d093-b12f-44a4-bdc7-bae8eb54dd04",
"9455b341-b06e-408e-8e10-54b32b3d74ec",
@@ -11,10 +15,12 @@ class CNES(Dataset):
@property
def name(self) -> str:
+ """Return the short name."""
return "CNES"
@property
def long_name(self) -> str:
+ """Return the human-readable name."""
return "Cadastro Nacional de Estabelecimentos de Saúde"
@property
@@ -26,10 +32,13 @@ def description(self) -> str:
)
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract metadata from a filename (not yet implemented)."""
raise NotImplementedError()
class PNI(Dataset):
+ """Programa Nacional de Imunizações (PNI)."""
+
ids: list[str] = [
"2989d396-cb09-47e7-a3b8-a4b951ca0200",
"543aa08a-46c4-44e8-802e-198daa30753d",
@@ -42,10 +51,12 @@ class PNI(Dataset):
@property
def name(self) -> str:
+ """Return the short name."""
return "PNI"
@property
def long_name(self) -> str:
+ """Return the human-readable name."""
return "Programa Nacional de Imunizações"
@property
@@ -53,20 +64,25 @@ def description(self) -> str:
return "O PNI monitora a cobertura vacinal e doses aplicadas no Brasil."
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract metadata from a filename (not yet implemented)."""
raise NotImplementedError()
class SIA(Dataset):
+ """Sistema de Informações Ambulatoriais (SIA)."""
+
ids: list[str] = [
"9a335cb7-2b4f-4fce-8947-e8441b4a90af",
]
@property
def name(self) -> str:
+ """Return the short name."""
return "SIA"
@property
def long_name(self) -> str:
+ """Return the human-readable name."""
return "Sistema de Informações Ambulatoriais"
@property
@@ -76,10 +92,13 @@ def description(self) -> str:
"""
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract metadata from a filename (not yet implemented)."""
raise NotImplementedError()
class SINAN(Dataset):
+ """Sistema de Informação de Agravos de Notificação (SINAN)."""
+
ids: list[str] = [
"4d5e5d44-58a8-4d67-b8aa-4ef1e4b00a1c",
"5699abe0-0510-4da8-b47d-209b3bb32b34",
@@ -89,10 +108,12 @@ class SINAN(Dataset):
@property
def name(self) -> str:
+ """Return the short name."""
return "SINAN"
@property
def long_name(self) -> str:
+ """Return the human-readable name."""
return "Sistema de Informação de Agravos de Notificação"
@property
@@ -103,20 +124,25 @@ def description(self) -> str:
"""
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract metadata from a filename (not yet implemented)."""
raise NotImplementedError()
class SIM(Dataset):
+ """Sistema de Informação sobre Mortalidade (SIM)."""
+
ids: list[str] = [
"5f121f4d-47c6-428e-8ec6-e8ec56417172",
]
@property
def name(self) -> str:
+ """Return the short name."""
return "SIM"
@property
def long_name(self) -> str:
+ """Return the human-readable name."""
return "Sistema de Informação sobre Mortalidade"
@property
@@ -126,20 +152,25 @@ def description(self) -> str:
"""
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract metadata from a filename (not yet implemented)."""
raise NotImplementedError()
class SINASC(Dataset):
+ """Sistema de Informações sobre Nascidos Vivos (SINASC)."""
+
ids: list[str] = [
"441cc6bd-684a-4afd-a88b-ba4734c9e83e",
]
@property
def name(self) -> str:
+ """Return the short name."""
return "SINASC"
@property
def long_name(self) -> str:
+ """Return the human-readable name."""
return "Sistema de Informações sobre Nascidos Vivos"
@property
@@ -150,6 +181,7 @@ def description(self) -> str:
"""
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract metadata from a filename (not yet implemented)."""
raise NotImplementedError()
diff --git a/pysus/api/dadosgov/models.py b/pysus/api/dadosgov/models.py
index 314ddaa4..bc763f4f 100644
--- a/pysus/api/dadosgov/models.py
+++ b/pysus/api/dadosgov/models.py
@@ -1,3 +1,5 @@
+"""Internal domain models for datasets, groups, and files from dados.gov.br."""
+
import asyncio
import pathlib
from abc import abstractmethod
@@ -19,11 +21,14 @@
class File(BaseRemoteFile):
+ """A downloadable file from a dados.gov.br dataset."""
+
record: Recurso
type: str = "File"
_metadata: dict[str, Any] = PrivateAttr(default_factory=dict)
def __init__(self, **data):
+ """Initialize the File with optional metadata."""
metadata = data.pop("_metadata", {})
super().__init__(**data)
self._metadata = metadata
@@ -33,6 +38,7 @@ def __repr__(self):
return self.basename
def model_post_init(self, __context: Any) -> None:
+ """Fetch remote metadata if size or modify date is missing."""
if not self.record.api_size or not self.record.last_modified:
try:
loop = asyncio.get_running_loop()
@@ -44,16 +50,19 @@ def model_post_init(self, __context: Any) -> None:
@property
def extension(self) -> str:
+ """Return the file extension."""
if self.record.file_name:
return pathlib.Path(self.record.file_name).suffix
return pathlib.Path(self.record.url.split("/")[-1].split("?")[0]).suffix
@property
def size(self) -> int:
+ """Return the file size in bytes."""
return self.record.api_size or 0
@property
def modify(self) -> datetime:
+ """Return the last modification date."""
m = self.record.last_modified
if not m:
raise ValueError("File requires a modify date")
@@ -61,17 +70,21 @@ def modify(self) -> datetime:
@property
def year(self) -> int | None:
+ """Return the inferred year from metadata."""
return self._metadata.get("year")
@property
def month(self) -> int | None:
+ """Return the inferred month from metadata."""
return self._metadata.get("month")
@property
def state(self) -> State | None:
+ """Return the inferred state from metadata."""
return self._metadata.get("state")
async def fetch_metadata(self) -> None:
+ """Fetch file size and last-modified from the remote server."""
try:
async with httpx.AsyncClient(
follow_redirects=True,
@@ -102,11 +115,13 @@ async def _download(
output: pathlib.Path | None = None,
callback: Callable[[int], None] | None = None,
) -> pathlib.Path:
+ """Download the file to a local path."""
if not output:
output = CACHEPATH / self.name
return await self.client._download_file(self, output, callback=callback)
async def fetch_size(self) -> int:
+ """Fetch the remote file size and update the local record."""
try:
async with httpx.AsyncClient(
follow_redirects=True,
@@ -130,6 +145,8 @@ async def fetch_size(self) -> int:
class Group(BaseRemoteGroup):
+ """A group of files within a dataset."""
+
record: ConjuntoDados
_formatter: (
Callable[
@@ -145,6 +162,7 @@ def __init__(
dataset: BaseRemoteDataset,
formatter: Callable | None = None,
):
+ """Initialize the Group with a dataset record and optional formatter."""
super().__init__(dataset=dataset)
self.record = record
self._formatter = formatter
@@ -154,17 +172,21 @@ def __repr__(self):
@property
def name(self) -> str:
+ """Return the group slug name."""
return self.record.slug
@property
def long_name(self) -> str:
+ """Return the group title."""
return self.record.title
@property
def description(self) -> str:
+ """Return an empty description."""
return ""
async def _fetch_files(self) -> list[BaseRemoteFile]:
+ """Build File objects from the underlying resources."""
files: list[BaseRemoteFile] = []
for recurso in self.record.resources:
metadata = self._formatter(recurso, self) if self._formatter else {}
@@ -179,6 +201,8 @@ async def _fetch_files(self) -> list[BaseRemoteFile]:
class Dataset(BaseRemoteDataset):
+ """A health dataset available through dados.gov.br."""
+
ids: list[str] = []
client: "DadosGov"
@@ -187,9 +211,11 @@ def __repr__(self):
@abstractmethod
def formatter(self, filename: str) -> dict[str, Any]:
+ """Extract structured metadata from a filename."""
pass
async def _fetch_content(self) -> list[Group]:
+ """Fetch all groups belonging to this dataset."""
items: list[Group] = []
client: "DadosGov" = self.client
if self.ids:
diff --git a/pysus/api/ducklake/__init__.py b/pysus/api/ducklake/__init__.py
index 2d2f5d16..4ba051d0 100644
--- a/pysus/api/ducklake/__init__.py
+++ b/pysus/api/ducklake/__init__.py
@@ -1 +1,7 @@
+"""DuckLake subpackage for interacting with the PySUS S3 catalog.
+
+Provides a DuckDB-based client for querying and downloading
+public health datasets stored in object storage.
+"""
+
from .client import DuckLake as DuckLakeClient # noqa
diff --git a/pysus/api/ducklake/catalog.py b/pysus/api/ducklake/catalog.py
index b6d7d06d..cc83ba3f 100644
--- a/pysus/api/ducklake/catalog.py
+++ b/pysus/api/ducklake/catalog.py
@@ -1,3 +1,9 @@
+"""SQLAlchemy ORM models for the DuckLake catalog schema.
+
+Defines tables for datasets, groups, files, and columns stored
+in the pysus schema of the local DuckDB catalog.
+"""
+
import enum
from datetime import datetime
from typing import Optional
@@ -18,6 +24,8 @@
class Base(DeclarativeBase):
+ """Base class for all DuckLake catalog ORM models."""
+
pass
@@ -41,16 +49,22 @@ class Base(DeclarativeBase):
class CatalogTable(Base):
+ """Abstract base for catalog tables sharing the pysus schema."""
+
__abstract__ = True
__table_args__: tuple = ({"schema": "pysus"},)
class Origin(enum.Enum):
+ """Origin type for a dataset: FTP or API."""
+
FTP = "ftp"
API = "api"
class CatalogDataset(CatalogTable):
+ """ORM model for the datasets table, representing a dataset collection."""
+
__tablename__ = "datasets"
id = Column(
@@ -81,6 +95,8 @@ class CatalogDataset(CatalogTable):
class ColumnDefinition(CatalogTable):
+ """ORM model for dataset column metadata (name, type, description)."""
+
__tablename__ = "dataset_columns"
id = Column(
@@ -113,6 +129,8 @@ class ColumnDefinition(CatalogTable):
class DatasetGroup(CatalogTable):
+ """ORM model for dataset groups, grouping related files within a dataset."""
+
__tablename__ = "dataset_groups"
id = Column(
@@ -144,6 +162,8 @@ class DatasetGroup(CatalogTable):
class CatalogFile(CatalogTable):
+ """ORM model for the files table, representing individual data files."""
+
__tablename__ = "files"
id: Mapped[int] = mapped_column(
diff --git a/pysus/api/ducklake/client.py b/pysus/api/ducklake/client.py
index 8dbb2523..469da9b2 100644
--- a/pysus/api/ducklake/client.py
+++ b/pysus/api/ducklake/client.py
@@ -1,3 +1,9 @@
+"""High-level client for DuckLake S3-based dataset catalog.
+
+Provides authentication, catalog synchronization, dataset querying,
+and file download capabilities backed by a local DuckDB engine.
+"""
+
from collections.abc import Callable
from pathlib import Path
from typing import Any
@@ -18,6 +24,8 @@
class CatalogDatasetAdapter:
+ """Adapter wrapping a CatalogDataset ORM record for use by File objects."""
+
def __init__(self, catalog_dataset: CatalogDataset, ducklake):
self.name = catalog_dataset.name
self.long_name = catalog_dataset.long_name or ""
@@ -28,10 +36,13 @@ def __init__(self, catalog_dataset: CatalogDataset, ducklake):
@property
def content(self):
+ """Query the DuckLake client for files in this dataset."""
return self.ducklake.query(dataset=self.name.upper())
class DatasetGroupAdapter:
+ """Adapter wrapping a DatasetGroup ORM record for use by File objects."""
+
def __init__(self, dataset_group: DatasetGroup, dataset):
self.name = dataset_group.name
self.long_name = dataset_group.long_name or ""
@@ -43,21 +54,28 @@ def __str__(self):
@property
async def files(self):
+ """Return an empty list; this adapter does not load files."""
return []
async def _fetch_files(self):
+ """Return an empty list; no remote fetch is performed."""
return []
async def search(self, **kwargs):
+ """Return an empty list; the search criteria are ignored."""
return []
class DuckLakeCredentials(BaseModel):
+ """Credentials for authenticating with the S3-compatible object storage."""
+
access_key: SecretStr
secret_key: SecretStr
class DuckLake(BaseRemoteClient):
+ """Client for the DuckLake S3-based public health dataset catalog."""
+
endpoint: str = "nbg1.your-objectstorage.com"
region: str = "nbg1"
bucket: str = "pysus"
@@ -71,6 +89,7 @@ class DuckLake(BaseRemoteClient):
_Session: Any = PrivateAttr(default=None)
def __init__(self, engine=None, **data):
+ """Initialize the DuckLake client with an optional existing engine."""
super().__init__(**data)
self._engine = engine
self._cache_dir = Path(CACHEPATH) / "ducklake"
@@ -79,29 +98,36 @@ def __init__(self, engine=None, **data):
@property
def name(self) -> str:
+ """Return the short name of this client."""
return "DuckLake"
@property
def long_name(self) -> str:
+ """Return the human-readable name of this client."""
return "PySUS s3 Client"
@property
def description(self) -> str:
+ """Return a description of this client."""
return "" # TODO:
@property
def catalog_path(self) -> Path:
+ """Return the local path to the downloaded catalog database."""
return self._catalog_local
@property
def _catalog_url(self) -> str:
+ """Return the remote URL of the catalog database file."""
return f"https://{self.endpoint}/{self.bucket}/{self._catalog_remote}"
@property
def _is_authenticated(self) -> bool:
+ """Return whether the client has credentials configured."""
return self.credentials is not None
async def datasets(self, **kwargs) -> list[DuckDataset]:
+ """Return all datasets from the catalog as DuckDataset instances."""
if not self._Session:
await self.connect()
@@ -129,6 +155,7 @@ async def login(
secret_key: str | None = None,
**kwargs,
) -> None:
+ """Authenticate with S3 credentials and reconnect to the catalog."""
if access_key and secret_key:
self.credentials = DuckLakeCredentials(
access_key=SecretStr(access_key),
@@ -145,6 +172,7 @@ async def login(
)
def _setup_engine(self):
+ """Create and configure the DuckDB engine with S3 settings."""
engine = create_engine(
f"duckdb:///{self._catalog_local}",
poolclass=StaticPool,
@@ -188,6 +216,7 @@ def _setup_engine(self):
return engine
async def connect(self, force: bool = False):
+ """Connect to the catalog, downloading it first if necessary."""
if self._engine and not force:
if not self._Session:
self._Session = sessionmaker(bind=self._engine)
@@ -198,6 +227,7 @@ async def connect(self, force: bool = False):
self._Session = sessionmaker(bind=self._engine)
async def close(self):
+ """Dispose the engine, then upload the catalog if authenticated."""
if self._engine:
await to_thread.run_sync(self._engine.dispose)
@@ -215,6 +245,7 @@ async def _download_file(
output: Path,
callback: Callable[[int], None] | None = None,
) -> Path:
+ """Download a single file from object storage to the local path."""
if not isinstance(file, File):
raise ValueError("FTP File was not properly instantiated")
@@ -230,6 +261,7 @@ async def _download_file(
return output
async def _download_catalog(self, client: httpx.AsyncClient):
+ """Download the catalog database from remote storage with retries."""
max_retries = 5
for attempt in range(max_retries):
@@ -249,6 +281,7 @@ async def _download_catalog(self, client: httpx.AsyncClient):
raise e
def _get_s3_client(self):
+ """Create and return a boto3 S3 client for the configured endpoint."""
if not self.credentials:
raise ConnectionError("S3 Credentials not found")
return boto3.client(
@@ -263,6 +296,7 @@ def _get_s3_client(self):
)
async def _load_catalog(self):
+ """Download remote catalog if the local copy is outdated or missing."""
async with httpx.AsyncClient(follow_redirects=True) as client:
local_size = -1
if self._catalog_local.exists():
@@ -280,6 +314,7 @@ async def _load_catalog(self):
await self._download_catalog(client)
async def _upload_catalog(self):
+ """Upload the local catalog database to remote storage."""
if not self._is_authenticated:
raise PermissionError(
"Admin credentials required to upload catalog.",
@@ -302,6 +337,7 @@ async def query(
year: int | None = None,
month: int | None = None,
) -> list[File]:
+ """Query catalog files by dataset, group, state, year, and/or month."""
if not self._Session:
await self.connect()
diff --git a/pysus/api/ducklake/models.py b/pysus/api/ducklake/models.py
index bb4e8da6..527f2caa 100644
--- a/pysus/api/ducklake/models.py
+++ b/pysus/api/ducklake/models.py
@@ -1,3 +1,9 @@
+"""Application-level models for DuckLake remote resources.
+
+Wraps catalog ORM records into BaseRemoteFile, BaseRemoteDataset,
+and BaseRemoteGroup interfaces used by the rest of PySUS.
+"""
+
import hashlib
from collections.abc import Callable
from datetime import datetime
@@ -18,6 +24,8 @@
class File(BaseRemoteFile):
+ """A remote file in DuckLake catalog with download and verification."""
+
record: CatalogFile = Field(exclude=True)
type: str = "remote"
dataset: Any
@@ -25,26 +33,32 @@ class File(BaseRemoteFile):
@property
def basename(self) -> str:
+ """Return the file name without directory components."""
return self.path.name
@property
def extension(self) -> str:
+ """Return the file extension including the leading dot."""
return self.path.suffix
@property
def size(self) -> int:
+ """Return the file size in bytes."""
return self.record.size
@property
def modify(self) -> datetime:
+ """Return the last-modified timestamp."""
return self.record.modified
@property
def rows(self) -> int:
+ """Return the number of rows in the file."""
return self.record.rows
@property
def sha256(self) -> str | None:
+ """Return the SHA-256 hash of the file, if available."""
return self.record.sha256
async def _download(
@@ -52,6 +66,7 @@ async def _download(
output: Path | None = None,
callback: Callable[[int], None] | None = None,
) -> Path:
+ """Download the file from object storage to the given output path."""
if not output:
output = CACHEPATH / self.name
@@ -62,6 +77,7 @@ async def _download(
)
async def verify(self, path: Path) -> bool:
+ """Verify the file matches the recorded SHA-256 hash."""
if not self.sha256:
return True
@@ -77,6 +93,8 @@ def _calculate():
class DuckDataset(BaseRemoteDataset):
+ """A dataset from the DuckLake catalog, containing groups and files."""
+
record: CatalogDataset = Field(exclude=True)
client: BaseRemoteClient = Field(exclude=True)
@@ -85,10 +103,12 @@ def __repr__(self) -> str:
@property
def name(self) -> str:
+ """Return the short name of the dataset."""
return self.record.name
@property
def long_name(self) -> str:
+ """Return the human-readable name of the dataset."""
return (
self.record.dataset_metadata.long_name
if self.record.dataset_metadata
@@ -97,6 +117,7 @@ def long_name(self) -> str:
@property
def description(self) -> str:
+ """Return the description of the dataset."""
return (
self.record.dataset_metadata.description
if self.record.dataset_metadata
@@ -104,6 +125,7 @@ def description(self) -> str:
)
async def _fetch_content(self) -> list[Union["DuckGroup", File]]:
+ """Fetch groups and files belonging to this dataset."""
items: list[Union["DuckGroup", File]] = []
if self.record.groups:
@@ -127,15 +149,19 @@ async def _fetch_content(self) -> list[Union["DuckGroup", File]]:
class DuckGroup(BaseRemoteGroup):
+ """A group of related files within a DuckLake dataset."""
+
record: DatasetGroup = Field(exclude=True)
dataset: DuckDataset = Field(exclude=True)
@property
def name(self) -> str:
+ """Return the short name of the group."""
return self.record.name
@property
def long_name(self) -> str:
+ """Return the human-readable name of the group."""
return (
self.record.group_metadata.long_name
if self.record.group_metadata
@@ -144,11 +170,13 @@ def long_name(self) -> str:
@property
def description(self) -> str:
+ """Return the description of the group."""
if self.record.group_metadata:
return self.record.group_metadata.description
return ""
async def _fetch_files(self) -> list[BaseRemoteFile]:
+ """Fetch the list of files belonging to this group."""
files: list[BaseRemoteFile] = [
File(
path=f.path,
diff --git a/pysus/api/extensions.py b/pysus/api/extensions.py
index d4739e53..d1029049 100644
--- a/pysus/api/extensions.py
+++ b/pysus/api/extensions.py
@@ -1,3 +1,5 @@
+"""Map file extensions and MIME types to their handler classes."""
+
import asyncio
import csv
import ctypes.util
@@ -40,15 +42,20 @@
class File(BaseLocalFile):
+ """Represents a generic local file with no special handling."""
+
type: FileType = Field("FILE")
async def load(self) -> bytes:
+ """Read the entire file contents into memory as bytes."""
return await to_thread.run_sync(self.path.read_bytes)
async def stream(
self,
chunk_size: int = 1024 * 1024,
) -> AsyncGenerator[bytes, None]:
+ """Yield the file contents in chunks of the given size."""
+
def _read_sync():
with open(self.path, "rb") as f:
while chunk := f.read(chunk_size):
@@ -60,12 +67,16 @@ def _read_sync():
class Directory(BaseLocalFile):
+ """Represents a directory on the local filesystem."""
+
type: FileType = Field("DIR")
def __repr__(self) -> str:
+ """Return the directory name with a trailing slash."""
return f"{self.basename}/"
async def load(self) -> list[BaseLocalFile]:
+ """Load all entries inside the directory as file objects."""
from pysus.api.extensions import ExtensionFactory
if not self.path.exists():
@@ -79,6 +90,7 @@ async def stream(
self,
chunksize: int = 10000,
) -> AsyncGenerator[BaseLocalFile, None]:
+ """Yield each entry inside the directory as a file object."""
from pysus.api.extensions import ExtensionFactory
for p in self.path.iterdir():
@@ -86,17 +98,21 @@ async def stream(
class CSV(BaseTabularFile):
+ """Represents a CSV file with automatic encoding and separator detection."""
+
type: FileType = Field("CSV")
_encoding: str | None = PrivateAttr(default=None)
_sep: str | None = PrivateAttr(default=None)
@property
def columns(self) -> list[str]:
+ """Return the column names from the CSV header row."""
df = pd.read_csv(self.path, sep=",", nrows=0)
return df.columns.tolist()
@property
def rows(self) -> int:
+ """Return the number of data rows in the file."""
count = 0
with open(self.path, "rb") as f:
for _ in f:
@@ -104,6 +120,7 @@ def rows(self) -> int:
return max(0, count - 1)
async def _get_encoding(self) -> str:
+ """Detect and cache the file's character encoding."""
if self._encoding is None:
def detect():
@@ -115,6 +132,7 @@ def detect():
return self._encoding
async def _get_sep(self) -> str:
+ """Sniff and cache the CSV delimiter."""
if self._sep is None:
encoding = await self._get_encoding()
@@ -131,6 +149,7 @@ def sniff():
return self._sep
async def load(self) -> pd.DataFrame:
+ """Read the entire CSV into a DataFrame."""
encoding = await self._get_encoding()
separator = await self._get_sep()
@@ -145,6 +164,7 @@ async def stream(
self,
chunk_size: int = 10000,
) -> AsyncGenerator[pd.DataFrame, None]:
+ """Yield the CSV in chunks of the given number of rows."""
encoding = await self._get_encoding()
separator = await self._get_sep()
@@ -165,21 +185,28 @@ def _get_reader_sync():
class Parquet(BaseTabularFile):
+ """Represents a Parquet file with optional date and integer type parsing."""
+
type: FileType = Field("PARQUET")
@property
def schema(self) -> pa.Schema:
+ """Return the Parquet schema as a PyArrow Schema object."""
return pq.read_schema(self.path)
@property
def columns(self) -> list[str]:
+ """Return the column names from the Parquet schema."""
return pq.read_schema(self.path).names
@property
def rows(self) -> int:
+ """Return the number of rows from the Parquet metadata."""
return pq.read_metadata(self.path).num_rows
async def load(self, parse: bool = True) -> pd.DataFrame:
+ """Read the entire Parquet file into a DataFrame."""
+
def _load():
df = pd.read_parquet(self.path, engine="pyarrow")
return self.parse_dftypes(df) if parse else df
@@ -189,6 +216,7 @@ def _load():
async def stream(
self, chunk_size: int = 10000, parse: bool = False
) -> AsyncGenerator[pd.DataFrame, None]:
+ """Yield the Parquet file in batches of the given size."""
parquet_file = await to_thread.run_sync(pq.ParquetFile, self.path)
if parquet_file.metadata.num_row_groups == 0:
@@ -203,6 +231,8 @@ async def stream(
@staticmethod
def parse_dftypes(df: pd.DataFrame) -> pd.DataFrame:
+ """Convert known date and integer columns to their proper types."""
+
def str_to_int(string):
if pd.isna(string):
return string
@@ -232,17 +262,22 @@ def str_to_date(string):
class DBF(BaseTabularFile):
+ """Represents a dBASE (DBF) file."""
+
type: FileType = Field("DBF")
@property
def columns(self) -> list[str]:
+ """Return the field names from the DBF file."""
return DBFReader(self.path, load=False).field_names
@property
def rows(self) -> int:
+ """Return the number of records in the DBF file."""
return len(DBFReader(self.path, load=False))
def decode_column(self, value):
+ """Decode a byte string value using cp1252 encoding."""
if isinstance(value, bytes):
return (
value.decode(encoding="cp1252", errors="replace")
@@ -254,6 +289,8 @@ def decode_column(self, value):
return value
async def load(self) -> pd.DataFrame:
+ """Read the entire DBF file into a DataFrame."""
+
def _load():
dbf = DBFReader(self.path, encoding="cp1252", raw=True)
df = pd.DataFrame(iter(dbf))
@@ -265,6 +302,8 @@ async def stream(
self,
chunk_size: int = 30000,
) -> AsyncGenerator[pd.DataFrame, None]:
+ """Yield the DBF records in chunks of the given size."""
+
def _get_db():
return DBFReader(self.path, encoding="cp1252", raw=True)
@@ -286,6 +325,7 @@ async def to_parquet(
chunk_size: int = 30000,
callback: Callable[[int, int], None] | None = None,
) -> "Parquet":
+ """Convert the DBF file to Parquet format."""
from pysus.api.extensions import ExtensionFactory
out = (
@@ -349,21 +389,26 @@ async def _stream_to_single_file():
class DBC(BaseTabularFile):
+ """Represents a compressed DBC file, convertible to DBF then Parquet."""
+
type: FileType = Field("DBC")
@property
def columns(self) -> list[str]:
+ """Not supported for DBC files. Convert to Parquet first."""
raise NotImplementedError(
"DBC metadata cannot be read directly. Convert to Parquet first."
)
@property
def rows(self) -> int:
+ """Not supported for DBC files. Convert to Parquet first."""
raise NotImplementedError(
"DBC metadata cannot be read directly. Convert to Parquet first."
)
async def load(self) -> pd.DataFrame:
+ """Convert to Parquet and load the result as a DataFrame."""
parquet = await self.to_parquet()
return await parquet.load()
@@ -371,6 +416,7 @@ async def stream(
self,
chunk_size: int = 10000,
) -> AsyncGenerator[pd.DataFrame, None]:
+ """Convert to Parquet and stream its chunks."""
parquet = await self.to_parquet()
async for chunk in parquet.stream(chunk_size=chunk_size):
yield chunk
@@ -381,6 +427,7 @@ async def to_parquet(
chunk_size: int = 30000,
callback: Callable[[int, int], None] | None = None,
) -> "Parquet":
+ """Decompress DBC to DBF, then convert to Parquet."""
from pysus.api.extensions import ExtensionFactory
if output_path is None:
@@ -414,10 +461,13 @@ async def to_parquet(
class JSON(BaseTabularFile):
+ """Represents a JSON file with tabular data."""
+
type: FileType = Field("JSON")
@property
def columns(self) -> list[str]:
+ """Return the column names from the JSON file."""
df = (
pd.read_json(self.path, nrows=0)
if self.path.stat().st_size > 0
@@ -427,27 +477,35 @@ def columns(self) -> list[str]:
@property
def rows(self) -> int:
+ """Return the number of rows in the JSON file."""
return len(pd.read_json(self.path))
async def load(self) -> pd.DataFrame:
+ """Read the entire JSON file into a DataFrame."""
return await to_thread.run_sync(pd.read_json, self.path)
async def stream(
self,
chunk_size: int = 10000,
) -> AsyncGenerator[pd.DataFrame, None]:
+ """Yield the entire JSON file as a single DataFrame."""
yield await self.load()
class PDF(BaseLocalFile):
+ """Represents a PDF file."""
+
type: FileType = Field("PDF")
async def load(self) -> bytes:
+ """Read the entire PDF file contents into memory as bytes."""
return await to_thread.run_sync(self.path.read_bytes)
async def stream(
self, chunk_size: int | None = None
) -> AsyncGenerator[bytes, None]:
+ """Yield the PDF file contents, in chunks when chunk_size is set."""
+
def _read():
with open(self.path, "rb") as f:
if chunk_size:
@@ -462,12 +520,17 @@ def _read():
class Zip(BaseCompressedFile):
+ """Represents a ZIP archive file."""
+
type: FileType = Field("ZIP")
async def load(self) -> zipfile.ZipFile:
+ """Open and return the ZIP archive."""
return await to_thread.run_sync(zipfile.ZipFile, self.path)
async def list_members(self) -> list[str]:
+ """Return the list of member names inside the archive."""
+
def _list():
with zipfile.ZipFile(self.path) as z:
return z.namelist()
@@ -475,6 +538,8 @@ def _list():
return await to_thread.run_sync(_list)
async def open_member(self, member_name: str) -> bytes:
+ """Read and return the contents of a named archive member."""
+
def _read():
with zipfile.ZipFile(self.path) as z:
return z.read(member_name)
@@ -485,6 +550,7 @@ async def extract(
self,
target_dir: Path = CACHEPATH,
) -> list[BaseLocalFile]:
+ """Extract members to a target directory and return as file objects."""
from pysus.api.extensions import ExtensionFactory
target_dir = Path(target_dir).expanduser().resolve()
@@ -506,6 +572,7 @@ async def to_parquet(
chunk_size: int = 30000,
callback: Callable[[int, int], None] | None = None,
) -> "Parquet":
+ """Extract the archive and convert the first tabular file to Parquet."""
final_output = (
Path(output_path or self.path.with_suffix(".parquet"))
.expanduser()
@@ -535,6 +602,8 @@ async def to_parquet(
await self._safe_cleanup(temp_dir)
async def _safe_cleanup(self, directory: Path):
+ """Remove a temporary directory and its contents."""
+
def _cleanup():
if not directory.exists():
return
@@ -555,9 +624,13 @@ def _cleanup():
class GZip(BaseCompressedFile):
+ """Represents a GZip-compressed file."""
+
type: FileType = Field("ZIP")
async def load(self) -> bytes:
+ """Decompress and read the entire file contents into memory."""
+
def _read():
with gzip.open(self.path, "rb") as f:
return f.read()
@@ -565,15 +638,19 @@ def _read():
return await to_thread.run_sync(_read)
async def list_members(self) -> list[str]:
+ """Return a list containing the single decompressed file name."""
return [self.path.stem]
async def open_member(self, member_name: str) -> bytes:
+ """Read and return the decompressed file contents."""
return await self.load()
async def extract(
self,
target_dir: Path = CACHEPATH,
) -> list[BaseLocalFile]:
+ """Decompress the file to a target directory
+ and return it as a file object."""
from pysus.api.extensions import ExtensionFactory
target_dir.mkdir(parents=True, exist_ok=True)
@@ -594,12 +671,17 @@ def _decompress():
class Tar(BaseCompressedFile):
+ """Represents a Tar archive file."""
+
type: FileType = Field("ZIP")
async def load(self) -> tarfile.TarFile:
+ """Open and return the tar archive."""
return await to_thread.run_sync(tarfile.open, self.path)
async def list_members(self) -> list[str]:
+ """Return the list of member names inside the archive."""
+
def _list():
with tarfile.open(self.path) as t:
return t.getnames()
@@ -607,6 +689,8 @@ def _list():
return await to_thread.run_sync(_list)
async def open_member(self, member_name: str) -> bytes:
+ """Read and return the contents of a named archive member."""
+
def _read():
with tarfile.open(self.path) as t:
f = t.extractfile(member_name)
@@ -618,6 +702,7 @@ async def extract(
self,
target_dir: Path = CACHEPATH,
) -> list[BaseLocalFile]:
+ """Extract members to a target directory and return as file objects."""
from pysus.api.extensions import ExtensionFactory
target_dir.mkdir(parents=True, exist_ok=True)
@@ -633,6 +718,8 @@ def _extract():
class FTPNotImported(BaseTabularFile):
+ """Placeholder for DBC files when optional dependency is not installed."""
+
path: Path = Field(default_factory=lambda: Path("..."))
type: str | FileType = Field(default="remote")
import_err: ClassVar[
@@ -645,35 +732,44 @@ class FTPNotImported(BaseTabularFile):
@property
def name(self) -> str:
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
@property
def extension(self) -> str:
+ """Return the .dbc extension."""
return ".dbc"
@property
def size(self) -> int:
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
@property
def modify(self) -> datetime:
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
@property
def columns(self) -> list[str]:
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
@property
def rows(self) -> int:
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
async def load(self) -> pd.DataFrame:
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
def stream(
self,
chunk_size: int = 10000,
) -> AsyncGenerator[pd.DataFrame, None]:
+ """Return an async generator that raises ImportError once iterated."""
+
async def _internal_gen():
raise ImportError(self.import_err)
yield pd.DataFrame()
@@ -686,11 +782,13 @@ async def to_parquet(
chunk_size: int = 10000,
callback: Callable[[int, int], None] | None = None,
) -> Parquet:
-
+ """Raise ImportError indicating the missing DBC dependency."""
raise ImportError(self.import_err)
class ExtensionFactory:
+ """Factory that maps file extensions and MIME types to handler classes."""
+
_mime: dict[str, type[BaseLocalFile]] = {
"application/zip": Zip,
"application/x-gzip": GZip,
@@ -716,6 +814,7 @@ class ExtensionFactory:
@classmethod
async def _identify(cls, path: Path) -> type[BaseLocalFile] | None:
+ """Identify the file class by its MIME type."""
try:
mime = await to_thread.run_sync(
magic.from_file,
@@ -728,6 +827,7 @@ async def _identify(cls, path: Path) -> type[BaseLocalFile] | None:
@classmethod
async def get_file_class(cls, path: Path) -> type[BaseLocalFile]:
+ """Return handler class for path, falling back to extension matching."""
mime_class = await cls._identify(path)
if mime_class:
return mime_class
@@ -738,6 +838,7 @@ async def get_file_class(cls, path: Path) -> type[BaseLocalFile]:
@classmethod
async def instantiate(cls, path: str | Path) -> BaseLocalFile:
+ """Create and return the appropriate file handler for a given path."""
path = Path(path).expanduser().resolve()
if await to_thread.run_sync(path.is_dir):
return Directory(path=path, type="DIR")
diff --git a/pysus/api/ftp/__init__.py b/pysus/api/ftp/__init__.py
index d30edd44..f3138a4f 100644
--- a/pysus/api/ftp/__init__.py
+++ b/pysus/api/ftp/__init__.py
@@ -1 +1,3 @@
+"""FTP subpackage providing an async client for DATASUS datasets."""
+
from .client import FTP as FTPClient # noqa
diff --git a/pysus/api/ftp/client.py b/pysus/api/ftp/client.py
index e7d348ae..0329038b 100644
--- a/pysus/api/ftp/client.py
+++ b/pysus/api/ftp/client.py
@@ -1,3 +1,5 @@
+"""Async FTP client wrapping the standard ftplib for DATASUS data access."""
+
from __future__ import annotations
import pathlib
@@ -17,12 +19,16 @@
class FTPGroupInfo(TypedDict):
+ """Metadata describing a file group within a dataset."""
+
name: str
long_name: str | None
description: str | None
class FTPFileInfo(TypedDict):
+ """Parsed metadata for a file or directory entry from an FTP listing."""
+
name: str
size: int
type: str
@@ -34,20 +40,25 @@ class FTPFileInfo(TypedDict):
class FTP(BaseRemoteClient):
+ """Async FTP client for navigating and downloading DATASUS data."""
+
host: str = "ftp.datasus.gov.br"
_ftp: FTPLib | None = PrivateAttr(default=None)
@property
def name(self) -> str:
+ """Return the short name of this client."""
return "FTP"
@property
def long_name(self) -> str:
+ """Return the human-readable name of this client."""
return "Pysus FTP Client"
@property
def description(self) -> str:
+ """Return a description of this client's purpose."""
return """
O cliente FTP do pysus foi desenvolvido para fornecer uma interface
assíncrona e moderna para navegação e extração de dados diretamente
@@ -58,9 +69,12 @@ def description(self) -> str:
@property
def ftp(self) -> FTPLib | None:
+ """Return the underlying ftplib.FTP, or None if not connected."""
return self._ftp
async def connect(self) -> None:
+ """Establish the FTP connection to the remote host."""
+
def _connect():
if self.ftp is None:
self._ftp = FTPLib(self.host)
@@ -69,9 +83,12 @@ def _connect():
await to_thread.run_sync(_connect)
async def login(self, **kwargs) -> None:
+ """Authenticate and connect to the FTP server (alias for connect)."""
await self.connect()
async def close(self) -> None:
+ """Close the FTP connection and reset the internal client state."""
+
def _close():
if self.ftp:
try:
@@ -84,6 +101,7 @@ def _close():
await to_thread.run_sync(_close)
async def datasets(self, **kwargs) -> list[Dataset]:
+ """Return a list of all available dataset instances for this client."""
from .databases import AVAILABLE_DATABASES
if self.ftp is None:
@@ -100,6 +118,8 @@ async def _download_file(
output: pathlib.Path,
callback: Callable[..., None] | None = None,
) -> pathlib.Path:
+ """Download a remote file locally, optionally reporting progress."""
+
async def _fetch():
try:
self.ftp.voidcmd("NOOP")
@@ -128,6 +148,7 @@ def _line_parser(
file_line: str,
formatter: Callable[[str], dict[str, Any]] | None = None,
) -> FTPFileInfo:
+ """Parse a line from a DATASUS FTP LIST response into FTPFileInfo."""
parts = file_line.strip().split()
if len(parts) < 4:
raise ValueError(f"Invalid FTP line: {file_line}")
@@ -165,6 +186,8 @@ async def _list_directory(
path: str,
formatter: Callable[[str], dict[str, Any]] | None = None,
) -> list[FTPFileInfo]:
+ """List the contents of a remote directory and parse each entry."""
+
def _list():
self.ftp.cwd(path)
lines = []
diff --git a/pysus/api/ftp/databases.py b/pysus/api/ftp/databases.py
index 5b042dad..c08b09de 100644
--- a/pysus/api/ftp/databases.py
+++ b/pysus/api/ftp/databases.py
@@ -1,3 +1,5 @@
+"""DATASUS FTP dataset definitions with filename parsers for each database."""
+
from typing import Any
from pysus.api.ftp.models import Dataset, Directory
@@ -5,6 +7,8 @@
class CIHA(Dataset):
+ """Comunicação de Internação Hospitalar e Ambulatorial (CIHA)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/CIHA/201101_/Dados"),
]
@@ -15,14 +19,17 @@ class CIHA(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "CIHA"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Comunicação de Internação Hospitalar e Ambulatorial"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return (
"A CIHA foi criada para ampliar o processo de planejamento, "
"programação, controle, avaliação e regulação da assistência à "
@@ -31,6 +38,7 @@ def description(self) -> str:
)
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse a CIHA filename into group, state, year and month metadata."""
try:
name = filename.split(".")[0].upper()
group_code = name[:4]
@@ -57,6 +65,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class CNES(Dataset):
+ """Cadastro Nacional de Estabelecimentos de Saúde (CNES)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/CNES/200508_/Dados"),
]
@@ -78,14 +88,17 @@ class CNES(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "CNES"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Cadastro Nacional de Estabelecimentos de Saúde"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return (
"O Cadastro Nacional de Estabelecimentos de Saúde (CNES) é o "
"sistema de informação oficial de cadastramento de informações "
@@ -93,6 +106,7 @@ def description(self) -> str:
)
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse a CNES filename into group, state, year and month metadata."""
try:
name = filename.split(".")[0].upper()
group_code = name[:2]
@@ -118,6 +132,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class SINASC(Dataset):
+ """Sistema de Informações sobre Nascidos Vivos (SINASC)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/SINASC/NOV/DNRES"),
Directory("/dissemin/publicos/SINASC/ANT/DNRES"),
@@ -129,20 +145,24 @@ class SINASC(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "SINASC"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Sistema de Informações sobre Nascidos Vivos"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return """
O SINASC fornece subsídios para o diagnóstico de saúde e
planejamento de políticas.
"""
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse a SINASC filename into group, state and year metadata."""
try:
name = filename.split(".")[0].upper()
year_short = name[-2:]
@@ -162,6 +182,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class SIM(Dataset):
+ """Sistema de Informação sobre Mortalidade (SIM)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/SIM/CID10/DORES"),
Directory("/dissemin/publicos/SIM/CID9/DORES"),
@@ -173,17 +195,21 @@ class SIM(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "SIM"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Sistema de Informação sobre Mortalidade"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return "O SIM coleta dados sobre obitos no pais para analise epidemiologica." # noqa
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse a SIM filename into group, state and year metadata."""
try:
name = filename.split(".")[0].upper()
if "CID9" in filename:
@@ -204,6 +230,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class PNI(Dataset):
+ """Programa Nacional de Imunizações (PNI)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/PNI/DADOS"),
]
@@ -214,17 +242,21 @@ class PNI(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "PNI"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Programa Nacional de Imunizações"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return "O SI-PNI monitora a cobertura vacinal e doses aplicadas."
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse a PNI filename into group, state and year metadata."""
try:
name = filename.split(".")[0].upper()
group_code, state, year_short = name[:4], name[4:6], name[-2:]
@@ -242,6 +274,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class IBGEDATASUS(Dataset):
+ """População Residente e Projeções (IBGE)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/IBGE/POP"),
Directory("/dissemin/publicos/IBGE/censo"),
@@ -262,17 +296,21 @@ class IBGEDATASUS(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "IBGE"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "População Residente e Projeções (IBGE)"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return "Informações sobre a população residente obtidas de Censos."
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse an IBGE filename into group and year metadata."""
try:
name = filename.split(".")[0].upper()
year = name[-2:]
@@ -295,6 +333,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class SIA(Dataset):
+ """Sistema de Informações Ambulatoriais (SIA) — outpatient information."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/SIASUS/199407_200712/Dados"),
Directory("/dissemin/publicos/SIASUS/200801_/Dados"),
@@ -311,17 +351,21 @@ class SIA(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "SIA"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Sistema de Informações Ambulatoriais"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return "O SIA acompanha as ações de saúde produzidas."
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse an SIA filename into group, state, year and month metadata."""
try:
name = filename.split(".")[0].upper()
digits = "".join([d for d in name if d.isdigit()])
@@ -350,6 +394,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class SIH(Dataset):
+ """Sistema de Informações Hospitalares (SIH)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/SIHSUS/199201_200712/Dados"),
Directory("/dissemin/publicos/SIHSUS/200801_/Dados"),
@@ -363,19 +409,23 @@ class SIH(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "SIH"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Sistema de Informações Hospitalares"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return """
O SIH processa as internações hospitalares financiadas pelo SUS.
"""
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse an SIH filename into group, state, year and month metadata."""
try:
name = filename.split(".")[0].upper()
group_code = name[:2]
@@ -397,6 +447,8 @@ def formatter(self, filename: str) -> dict[str, Any]:
class SINAN(Dataset):
+ """Sistema de Informação de Agravos de Notificação (SINAN)."""
+
paths: list[Directory] = [
Directory("/dissemin/publicos/SINAN/DADOS/FINAIS"),
Directory("/dissemin/publicos/SINAN/DADOS/PRELIM"),
@@ -456,17 +508,21 @@ class SINAN(Dataset):
@property
def name(self) -> str:
+ """Return the dataset short name."""
return "SINAN"
@property
def long_name(self) -> str:
+ """Return the dataset full name in Portuguese."""
return "Sistema de Informação de Agravos de Notificação"
@property
def description(self) -> str:
+ """Return a description of the dataset's purpose."""
return "O SINAN é alimentado pela notificação de doenças compulsórias."
def formatter(self, filename: str) -> dict[str, Any]:
+ """Parse a SINAN filename into group and year metadata."""
try:
name = filename.split(".")[0].upper()
year_short = name[-2:]
diff --git a/pysus/api/ftp/models.py b/pysus/api/ftp/models.py
index 0f5ec5dd..4efb0349 100644
--- a/pysus/api/ftp/models.py
+++ b/pysus/api/ftp/models.py
@@ -1,3 +1,5 @@
+"""Data model classes for FTP directories, files, groups and datasets."""
+
from __future__ import annotations
import os
@@ -21,9 +23,12 @@
class File(BaseRemoteFile):
+ """A single file on the DATASUS FTP server with parsed metadata."""
+
_info: FTPFileInfo = PrivateAttr()
def __init__(self, **data):
+ """Initialise the File with raw FTP metadata."""
info = data.pop("_info", None)
if "path" not in data and info and "path" in info:
data["path"] = info["path"]
@@ -42,18 +47,22 @@ def __init__(self, **data):
)
def __repr__(self) -> str:
+ """Return the file name as its string representation."""
return self.name
@property
def extension(self) -> str:
+ """Return the file extension (e.g. .dbc, .dbf)."""
return Path(self.path).suffix
@property
def size(self) -> int:
+ """Return the file size in bytes."""
return self._info.get("size", 0)
@property
def modify(self) -> datetime:
+ """Return the last modification timestamp."""
m = self._info.get("modify")
if not m:
raise ValueError("File requires a modify date")
@@ -61,14 +70,17 @@ def modify(self) -> datetime:
@property
def year(self) -> int | None:
+ """Return the data year extracted from the filename, if available."""
return self._info.get("year")
@property
def month(self) -> int | None:
+ """Return the data month extracted from the filename, if available."""
return self._info.get("month")
@property
def state(self) -> State | None:
+ """Return the state code extracted from the filename, if available."""
return self._info.get("state", None)
async def _download(
@@ -76,6 +88,7 @@ async def _download(
output: Path | None = None,
callback: Callable[[int], None] | None = None,
) -> Path:
+ """Download this file to a local path, optionally reporting progress."""
if output is None:
cache_dir = Path(CACHEPATH)
cache_dir.mkdir(parents=True, exist_ok=True)
@@ -85,6 +98,8 @@ async def _download(
class Directory:
+ """A remote FTP directory lazily loaded into files and subdirectories."""
+
def __init__(
self,
path: str,
@@ -93,6 +108,7 @@ def __init__(
formatter: Callable | None = None,
dataset: Dataset | None = None,
):
+ """Initialise the Directory with a remote path and optional context."""
self.path = os.path.normpath(path)
self.parent = parent
self.dataset = dataset or getattr(parent, "dataset", None)
@@ -104,11 +120,13 @@ def __init__(
@property
async def content(self) -> list[Directory | File]:
+ """Return the directory contents, loading from FTP if not yet cached."""
if not self.loaded:
await self.load()
return self._content
async def load(self) -> None:
+ """Fetch and parse the directory listing from the FTP server."""
if not isinstance(self.client, FTP):
raise ValueError("no ftp client found")
raw_infos = await self.client._list_directory(
@@ -144,13 +162,17 @@ async def load(self) -> None:
self.loaded = True
def __str__(self) -> str:
+ """Return the normalised directory path."""
return self.path
def __repr__(self) -> str:
+ """Return a debug representation of this directory."""
return f""
class Group(BaseRemoteGroup):
+ """A group of related files within a dataset (e.g. all files of a type)."""
+
path: str
_name: str = PrivateAttr()
_long_name: str = PrivateAttr()
@@ -166,6 +188,7 @@ def __init__(
description: str = "",
**data: Any,
):
+ """Initialise the Group with metadata and a directory reference."""
data.update({"dataset": dataset, "path": path})
super().__init__(**data)
@@ -182,51 +205,59 @@ def __init__(
@property
def name(self) -> str:
+ """Return the group short code (e.g. 'RD', 'PA')."""
return self._name
@property
def long_name(self) -> str:
+ """Return the human-readable group name."""
return self._long_name
@property
def description(self) -> str:
+ """Return the group description."""
return self._description
@property
async def content(self) -> list[Directory | File]:
+ """Return the contents of the underlying directory."""
return await self._dir.content
async def _fetch_files(self) -> list[BaseRemoteFile]:
+ """Return only the file entries from this group's directory."""
items = await self.content
return [item for item in items if isinstance(item, BaseRemoteFile)]
class Dataset(BaseRemoteDataset, ABC):
+ """Abstract base for a DATASUS dataset, providing file discovery via FTP."""
+
paths: list[Directory] = []
group_definitions: dict[str, str] = {}
@property
@abstractmethod
def name(self) -> str:
- pass
+ """Return the dataset short name."""
@property
@abstractmethod
def long_name(self) -> str:
- pass
+ """Return the dataset full name in Portuguese."""
@property
@abstractmethod
def description(self) -> str:
- pass
+ """Return a description of the dataset's purpose."""
@abstractmethod
def formatter(self, filename: str) -> dict[str, Any]:
- pass
+ """Parse a filename into metadata (group, state, year, etc.)."""
async def _fetch_content(
self,
) -> Sequence[BaseRemoteGroup | BaseRemoteFile]:
+ """Walk the dataset's root directories and return groups and files."""
results: list[BaseRemoteGroup | BaseRemoteFile] = []
for root_dir in self.paths:
@@ -258,4 +289,5 @@ async def _fetch_content(
return results
def __repr__(self) -> str:
+ """Return the dataset short name as its string representation."""
return self.name
diff --git a/pysus/api/models.py b/pysus/api/models.py
index aa7bbefe..e3771391 100644
--- a/pysus/api/models.py
+++ b/pysus/api/models.py
@@ -1,3 +1,12 @@
+"""Abstract model hierarchy for PySUS data access.
+
+Provides abstract base classes for local and remote file handling, organized
+in a layered hierarchy: BaseFile -> BaseLocalFile -> BaseTabularFile /
+BaseCompressedFile for local files, and BaseFile -> BaseRemoteFile for remote
+files, alongside BaseRemoteObject -> BaseRemoteGroup / BaseRemoteDataset /
+BaseRemoteClient for remote data catalogs.
+"""
+
from __future__ import annotations
import asyncio
@@ -23,6 +32,11 @@
class BaseFile(BaseModel, ABC):
+ """Abstract base for a single file, local or remote.
+
+ Subclasses must implement *name*, *extension*, *size*, and *modify*.
+ """
+
model_config = ConfigDict(
arbitrary_types_allowed=True,
validate_assignment=True,
@@ -34,10 +48,11 @@ class BaseFile(BaseModel, ABC):
@property
@abstractmethod
def name(self) -> str:
- pass
+ """Return the display name of the file."""
@property
def basename(self) -> str:
+ """Return the file name from the path."""
return self.path.name
def __str__(self) -> str:
@@ -46,20 +61,25 @@ def __str__(self) -> str:
@property
@abstractmethod
def extension(self) -> str:
- pass
+ """Return the file extension string."""
@property
@abstractmethod
def size(self) -> int:
- pass
+ """Return the file size in bytes."""
@property
@abstractmethod
def modify(self) -> datetime:
- pass
+ """Return the last modification timestamp."""
class BaseLocalFile(BaseFile, ABC):
+ """Abstract base for a file stored on the local filesystem.
+
+ Subclasses must implement *load* and *stream*.
+ """
+
path: Path
@property
@@ -69,6 +89,13 @@ def name(self) -> str:
async def get_hash(
self, algorithm: str = "sha256", chunk_size: int = 1024 * 1024
) -> str:
+ """Compute the file's hash digest.
+
+ *algorithm* is the hash algorithm name (default "sha256").
+ *chunk_size* is the read chunk size in bytes.
+ Return the hex digest string.
+ """
+
def _compute_hash():
hash_obj = hashlib.new(algorithm)
with open(self.path, "rb") as f:
@@ -80,14 +107,14 @@ def _compute_hash():
@abstractmethod
async def load(self) -> Any:
- pass
+ """Load the entire file content into memory and return it."""
@abstractmethod
def stream(
self,
chunk_size: int = 10000,
) -> AsyncGenerator[Any, None]:
- pass
+ """Yield chunks of the file content as an async generator."""
@property
def extension(self) -> str:
@@ -103,26 +130,31 @@ def modify(self) -> datetime:
class BaseTabularFile(BaseLocalFile, ABC):
+ """Abstract base for a local tabular file (e.g. CSV, Parquet).
+
+ Subclasses must implement *columns*, *rows*, *load*, and *stream*.
+ """
+
@property
@abstractmethod
def columns(self) -> list[str]:
- pass
+ """Return the list of column names."""
@property
@abstractmethod
def rows(self) -> int:
- pass
+ """Return the number of data rows."""
@abstractmethod
async def load(self) -> pd.DataFrame:
- pass
+ """Load the entire file into a pandas DataFrame."""
@abstractmethod
def stream(
self,
chunk_size: int = 10000,
) -> AsyncGenerator[pd.DataFrame, None]:
- pass
+ """Yield pandas DataFrames in chunks as an async generator."""
async def to_parquet(
self,
@@ -130,6 +162,13 @@ async def to_parquet(
chunk_size: int = 10000,
callback: Callable[[int, int], None] | None = None,
) -> Parquet:
+ """Convert the file to Parquet format.
+
+ *output_path* is the destination path; defaults to the source path
+ with a .parquet extension. *chunk_size* controls the streaming chunk
+ size. *callback* receives (current_rows, total_rows) after each chunk.
+ Return the resulting Parquet wrapper object.
+ """
from pysus.api.extensions import ExtensionFactory, Parquet
if output_path is None:
@@ -187,25 +226,31 @@ async def to_parquet(
class BaseCompressedFile(BaseLocalFile, ABC):
+ """Abstract base for a compressed archive file (e.g. .zip, .gz).
+
+ Subclasses must implement *load*, *list_members*, *open_member*, and *extract*.
+ """
+
@abstractmethod
async def list_members(self) -> list[str]:
- pass
+ """Return the list of member names inside the archive."""
@abstractmethod
async def open_member(self, member_name: str) -> Any:
- pass
+ """Open and return a single archive member by name."""
@abstractmethod
async def extract(
self,
target_dir: Path = CACHEPATH,
) -> list[BaseLocalFile]:
- pass
+ """Extract all members into *target_dir* and return the file objects."""
async def stream(
self,
chunk_size: int | None = None,
) -> AsyncGenerator[Any, None]:
+ """Yield each opened archive member; *chunk_size* is ignored."""
members = await self.list_members()
for member in members:
yield await self.open_member(member)
@@ -213,7 +258,10 @@ async def stream(
class SearchableMixin:
+ """Mixin providing attribute-based filtering for remote objects."""
+
def _matches(self, obj: Any, **kwargs) -> bool:
+ """Return True if all *kwargs* attributes match on *obj*."""
for key, value in kwargs.items():
obj_value = getattr(obj, key, None)
if obj_value != value:
@@ -222,6 +270,12 @@ def _matches(self, obj: Any, **kwargs) -> bool:
class BaseRemoteFile(BaseFile, SearchableMixin, ABC):
+ """Abstract base for a file stored on a remote server.
+
+ Subclasses must implement *extension*, *size*, *modify*, and *_download*.
+ *dataset* and *group* link back to the containing objects.
+ """
+
dataset: BaseRemoteDataset = Field(exclude=True)
group: BaseRemoteGroup | None = Field(default=None, exclude=True)
@@ -231,18 +285,22 @@ def name(self) -> str:
@property
def client(self) -> BaseRemoteClient:
+ """Return the remote client associated with this file."""
return self.dataset.client
@property
def year(self) -> int | None:
+ """Return the year associated with the file, or None."""
return None
@property
def month(self) -> int | None:
+ """Return the month associated with the file, or None."""
return None
@property
def state(self) -> State | None:
+ """Return the state associated with the file, or None."""
return None
@abstractmethod
@@ -251,13 +309,20 @@ async def _download(
output: Path | None = None,
callback: Callable[[int], None] | None = None,
) -> Path:
- pass
+ """Download the file to *output* and return the local path.
+
+ Subclasses implement the actual transfer logic.
+ """
async def download(
self,
output: str | Path | None = None,
callback: Callable[[int], None] | None = None,
) -> BaseLocalFile:
+ """Download the remote file to a local cache or *output* path.
+
+ Return the instantiated local file wrapper.
+ """
from pysus.api.extensions import ExtensionFactory
if output is None:
@@ -279,6 +344,11 @@ async def download(
class BaseRemoteObject(BaseModel, ABC):
+ """Abstract base for a named remote entity with a description.
+
+ Subclasses must implement *name*, *long_name*, and *description*.
+ """
+
model_config = ConfigDict(arbitrary_types_allowed=True)
def __str__(self) -> str:
@@ -287,38 +357,49 @@ def __str__(self) -> str:
@property
@abstractmethod
def name(self) -> str:
- pass
+ """Return the short name of the remote entity."""
@property
@abstractmethod
def long_name(self) -> str:
- pass
+ """Return the long / human-readable name."""
@property
@abstractmethod
def description(self) -> str:
- pass
+ """Return a textual description of the entity."""
class BaseRemoteGroup(BaseRemoteObject, SearchableMixin, ABC):
+ """Abstract base for a named group of remote files within a dataset.
+
+ Subclasses must implement *_fetch_files* and BaseRemoteObject's properties.
+ """
+
dataset: BaseRemoteDataset = Field(exclude=True)
_files: list[BaseRemoteFile] | None = PrivateAttr(default=None)
@property
def parent(self) -> BaseRemoteDataset:
+ """Return the parent dataset."""
return self.dataset
@abstractmethod
async def _fetch_files(self) -> list[BaseRemoteFile]:
- pass
+ """Fetch and return the list of files in this group."""
@property
async def files(self) -> list[BaseRemoteFile]:
+ """Return all files in this group, fetching them on first access."""
if self._files is None:
self._files = await self._fetch_files()
return self._files
async def search(self, **kwargs) -> list[BaseRemoteFile]:
+ """Filter files in this group by attribute *kwargs*.
+
+ Return matching file objects.
+ """
all_files = await self.files
if not kwargs:
return all_files
@@ -326,6 +407,11 @@ async def search(self, **kwargs) -> list[BaseRemoteFile]:
class BaseRemoteDataset(BaseRemoteObject, SearchableMixin, ABC):
+ """Abstract base for a dataset containing groups and/or files.
+
+ Subclasses must implement *_fetch_content* and the inherited properties.
+ """
+
client: BaseRemoteClient = Field(exclude=True)
group_definitions: dict[str, str] = {}
_content: Sequence[BaseRemoteGroup | BaseRemoteFile] | None = PrivateAttr(
@@ -336,18 +422,23 @@ class BaseRemoteDataset(BaseRemoteObject, SearchableMixin, ABC):
async def _fetch_content(
self,
) -> Sequence[BaseRemoteGroup | BaseRemoteFile]:
- pass
+ """Fetch and return the top-level content (groups and files)."""
@property
async def content(
self,
) -> Sequence[BaseRemoteGroup | BaseRemoteFile]:
+ """Return the dataset content, fetching on first access."""
if self._content is None:
self._content = await self._fetch_content()
return self._content
async def search(self, **kwargs) -> list[BaseRemoteFile]:
+ """Recursively search groups and files by attribute *kwargs*.
+
+ Return matching file objects.
+ """
contents = await self.content
matches = []
@@ -363,21 +454,27 @@ async def search(self, **kwargs) -> list[BaseRemoteFile]:
class BaseRemoteClient(BaseRemoteObject, ABC):
+ """Abstract base for a remote API client (e.g. FTP, HTTP).
+
+ Subclasses must implement *connect*, *close*, *login*, *datasets*,
+ *_download_file*, and the *name*, *long_name*, *description* properties.
+ """
+
@abstractmethod
async def connect(self) -> None:
- pass
+ """Establish a connection to the remote server."""
@abstractmethod
async def close(self) -> None:
- pass
+ """Close the connection to the remote server."""
@abstractmethod
async def login(self, **kwargs) -> None:
- pass
+ """Authenticate with the remote server using *kwargs* credentials."""
@abstractmethod
async def datasets(self, **kwargs) -> list:
- pass
+ """Return a list of available datasets matching *kwargs*."""
@abstractmethod
async def _download_file(
@@ -386,4 +483,4 @@ async def _download_file(
output: Path,
callback: Callable[[int], None] | None = None,
) -> Path:
- pass
+ """Download a single *file* to *output* and return the local path."""
diff --git a/pysus/api/types.py b/pysus/api/types.py
index 0f78d208..2ed8f95e 100644
--- a/pysus/api/types.py
+++ b/pysus/api/types.py
@@ -1,3 +1,13 @@
+"""Type aliases used across the PySUS API.
+
+FileType:
+ Literal alias listing the supported file type names
+ (FILE, DIR, PARQUET, CSV, JSON, PDF, DBC, DBF, ZIP).
+
+State:
+ Brazilian state abbreviations (AC, AL, AP, ..., DF).
+"""
+
from typing import Literal
FileType = Literal[