diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 64c0ea6..82983d2 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.25 +current_version = 1.0.37 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))? @@ -14,14 +14,14 @@ serialize = [bumpverion:part:build] values = [0-9A-Za-z-]+ -[bumpversion:file:setup.cfg] -search = version = {current_version} -replace = version = {new_version} +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" [bumpversion:file:mkdocs.yml] search = version: {current_version} replace = version: {new_version} [bumpversion:file:ebel/__init__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' +search = __version__ = "{current_version}" +replace = __version__ = "{new_version}" diff --git a/.readthedocs.yml b/.readthedocs.yml index 678db21..9eb4eef 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,14 +1,21 @@ # See: https://docs.readthedocs.io/en/latest/yaml-config.html version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.10" + jobs: + pre_create_environment: + - echo "Creating environment" + post_build: + - echo "Build finished successfully" + - echo `date` + python: - version: "3.7" install: - requirements: docs/requirements.txt - - method: pip - path: . - extra_requirements: - - requirements.txt + - requirements: requirements.txt mkdocs: configuration: mkdocs.yml diff --git a/README.rst b/README.rst index 312b88e..6bc0dc5 100644 --- a/README.rst +++ b/README.rst @@ -5,7 +5,7 @@ e(BE:L) is a Python package built for both validating and modeling information e This software package serves a comprehensive tool for all of your BEL needs and serves to create enriched knowledge graphs for developing and testing new theories and hypotheses. 
-**e(BE:L)** have implemented several other knowledge bases to extend the BEL knowledge graph or map identifiers. +**e(BE:L)** has implemented several other knowledge bases to extend the BEL knowledge graph or map identifiers: * `BioGrid `_ * `ChEBI `_ @@ -44,6 +44,53 @@ But we want to encourage you to use the latest development version which can be $ pip install git+https://github.com/e-bel/ebel + +Docker Installation +------------------- + +Make sure `docker `_ and `docker-compose `_ are installed. + +.. code-block:: + + docker-compose up --build -d + docker exec -it ebel_ebel ebel settings + +Several question will follow. You can accept the default values (just press RETURN) except the following questions: + +.. code-block:: + + OrientDB server [localhost] ? + ebel_orientdb + OrientDB root password (to create database and users) + ebel + MySQL/MariaDB sever name [localhost] + ebel_mysql + MySQL root password (will be not stored) to create database and user + ebel + +It's strongly recommended, if you are using ebel in the production environment, to change the +standard root MySQL and OrientDB passwords in the docker-compose.yml file. + +To load example files in container and import. + +.. code-block:: + + docker exec -it ebel_ebel git clone https://github.com/e-bel/example_json_bel_files.git + docker exec -it ebel_ebel ebel import-json example_json_bel_files/phago.bel.json -e + + +To enrich the network: + +.. code-block:: + + docker exec -it ebel_ebel ebel enrich + +Following services are now available: + +1. `OrientDB Studio `_ +2. `e(BE:L) REST server `_ +3. `phpMyAdmin `_ + Package Requirements ==================== @@ -235,51 +282,6 @@ To specify a different graph database to connect to than the one in your config # To overwrite your default values in the config file bel = Bel(graph_config=config_params, overwrite_config=True) -Docker installation -=================== - -Make sure `docker `_ and `docker-compose `_ are installed. - -.. 
code-block:: - - docker-compose up --build -d - docker exec -it ebel_ebel ebel settings - -Several question will follow. You can accept the default values (just press RETURN) except the following questions: - -.. code-block:: - - OrientDB server [localhost] ? - ebel_orientdb - OrientDB root password (to create database and users) - ebel - MySQL/MariaDB sever name [localhost] - ebel_mysql - MySQL root password (will be not stored) to create database and user - ebel - -It's strongly recommended, if you are using ebel in the production environment, to change the -standard root MySQL and OrientDB passwords in the docker-compose.yml file. - -To load example files in container and import. - -.. code-block:: - - docker exec -it ebel_ebel git clone https://github.com/e-bel/example_json_bel_files.git - docker exec -it ebel_ebel ebel ebel import-json example_json_bel_files/phago.json -e - - -To enrich the network: - -.. code-block:: - - docker exec -it ebel_ebel ebel enrich - -Following services are now available: - -1. `OrientDB Studio `_ -2. `e(BE:L) REST server `_ -3. 
`phpMyAdmin `_ API === diff --git a/docker-compose.yml b/docker-compose.yml index 563bf72..d8c2d80 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,8 +5,6 @@ services: container_name: ebel_ebel ports: - 5000:5000 - networks: - - ebel_network depends_on: - mysql - orientdb @@ -23,8 +21,6 @@ services: MYSQL_USER: ebel MYSQL_PASSWORD: ebel MYSQL_DATABASE: ebel - networks: - - ebel_network volumes: - ebel_mysql:/var/lib/mysql orientdb: @@ -36,8 +32,6 @@ services: ports: - 2424:2424 - 2480:2480 - networks: - - ebel_network volumes: - ebel_orientdb_db:/orientdb/databases - ebel_orientdb_backup:/orientdb/backup @@ -48,13 +42,10 @@ services: container_name: ebel_phpmyadmin ports: - 8089:80 - networks: - - ebel_network environment: - PMA_ARBITRARY=1 -networks: - ebel_network: - driver: bridge + - PMA_HOST=mysql + volumes: ebel_orientdb_db: ebel_orientdb_backup: diff --git a/docs/index.md b/docs/index.md index 520b03b..6da0bcc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1 +1,222 @@ # Welcome to the e(BE:L) Documentation! +## e(BE:L) +e(BE:L) is a Python package built for both validating and modeling information extracted from publications using `Biological Expression Language (BEL) `_. +This software package serves a comprehensive tool for all of your BEL needs and serves to create enriched knowledge graphs +for developing and testing new theories and hypotheses. + +*e(BE:L)* have implemented several other knowledge bases to extend the BEL knowledge graph or map identifiers. 
+
+* [BioGrid](https://thebiogrid.org/)
+* [ChEBI](https://www.ebi.ac.uk/chebi/)
+* [ClinicalTrials.gov](https://clinicaltrials.gov/)
+* [ClinVar](https://www.ncbi.nlm.nih.gov/clinvar/)
+* [DisGeNET](https://www.disgenet.org/)
+* [DrugBank](https://go.drugbank.com/)
+* [Ensembl](https://www.ensembl.org/)
+* [Expression Atlas](https://www.ebi.ac.uk/gxa/home)
+* [GWAS Catalog](https://www.ebi.ac.uk/gwas/)
+* [HGNC](https://www.genenames.org/)
+* [IntAct](https://www.ebi.ac.uk/intact/)
+* [Guide to PHARMACOLOGY](https://www.guidetopharmacology.org/)
+* [KEGG](https://www.genome.jp/kegg/)
+* [MirTarBase](https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/php/index.php)
+* [Different resources from NCBI](https://www.ncbi.nlm.nih.gov/)
+* [OffSides](http://tatonettilab.org/offsides/)
+* [Pathway Commons](https://www.pathwaycommons.org/)
+* [The Human Protein Atlas](https://www.proteinatlas.org/)
+* [Reactome](https://reactome.org/)
+* [STRING](https://string-db.org/)
+* [UniProt](https://www.uniprot.org/)
+
+## Installation
+
+The easiest way to install ebel is to use `docker-compose`. See the instructions below to use the docker installation.
+
+
+`ebel` can be directly installed from PyPI with pip
+```bash
+pip install ebel
+```
+
+But we want to encourage you to use the latest development version, which can be installed with:
+```bash
+pip install git+https://github.com/e-bel/ebel
+```
+
+## Package Requirements
+
+### Installing OrientDB
+
+This software package is designed to work in conjunction with OrientDB, a NoSQL, multi-model database
+that acts as both a graph and relational database. e(BE:L) uses OrientDB for generating the knowledge graph derived from BEL files. To get
+started with e(BE:L), first download OrientDB and get a server up and running.
+The first time the server is started, you will need to create a root password. Once it is up and running, you can
+start importing BEL files into it! 
+
+On Linux you can use the following commands
+```bash
+wget https://repo1.maven.org/maven2/com/orientechnologies/orientdb-community/3.2.2/orientdb-community-3.2.2.tar.gz
+tar -xvzf orientdb-community-3.2.2.tar.gz
+cd orientdb-community-3.2.2/bin
+./server.sh
+```
+
+### SQL Databases
+
+This package is capable of enriching the compiled knowledge graphs with a lot of external information; however, this requires
+a SQL database for storage. While a SQLite database can be used, this is not recommended, as the amount of data and
+complexity of queries will make it quite slow. Additionally, SQLite will not be directly supported; the methods will be built
+such that they should work with both SQLite and MySQL, but we will not address performance issues due to using SQLite.
+
+Instead, we recommend setting up a [MySQL server](https://www.mysql.com/downloads/) or
+MariaDB to use with e(BE:L). By default, [PyMySQL](https://pypi.org/project/PyMySQL/)
+is installed as a driver by e(BE:L), but others can also be used.
+
+On Linux Ubuntu you can use the following command
+```bash
+sudo apt install mysql-server -y
+```
+or
+```bash
+sudo apt install mariadb-server -y
+```
+
+
+### Configuration
+
+Before you start working with e(BE:L), a simple-to-use wizard helps you to set up all configurations. Make sure OrientDB
+and MySQL (or MariaDB) are running. Then start the configuration wizard with
+
+```bash
+ebel settings
+```
+
+The wizard will create the needed databases and users in OrientDB and MySQL/MariaDB.
+
+### Package Components
+
+To test the different components, you can find [here](https://github.com/e-bel/covid19_knowledge_graph/) several BEL and
+already compiled JSON files.
+
+## BEL Validation
+
+BEL is a domain-specific language designed to capture biological relationships in a computer- and human-readable format.
+The rules governing BEL statement generation can be quite complex and often mistakes are made during curation. 
+e(BE:L) includes a grammar and syntax checker that reads through given BEL files and validates whether each statement
+satisfies the guidelines provided by [BEL.bio](https://language.bel.bio). Should any BEL statement within the file
+not adhere to the rules, a report file is created by e(BE:L) explaining the error and offering suggested fixes.
+
+You can use the following command to validate your BEL file
+
+```bash
+ebel validate /path/to/bel_file.bel
+```
+
+In a single command, you can validate your BEL file as well as generate error reports if there are errors and, if there
+are none, produce an importable JSON file:
+
+```bash
+ebel validate /path/to/bel_file.bel -r error_report.xlsx -j
+```
+
+BEL documents should be properly formatted prior to validation. e(BE:L) contains a repair tool that will check the format,
+and it is highly recommended that this is used prior to validation. The repaired file will overwrite the original if a new file
+path is not specified. Here is an example:
+
+```bash
+ebel repair /path/to/bel_file.bel -n /path/to/repaired_file.bel
+```
+
+## Import Process
+
+### BEL Modeling - OrientDB
+
+BEL files that have passed the validation process can be imported into the
+database individually or *en masse*. During the import process, e(BE:L) automatically creates all the relevant nodes and edges
+as described in the BEL files. Additionally, e(BE:L) also automatically adds in missing nodes and edges that are known to exist,
+e.g. protein nodes with a respective RNA or gene node will have these automatically added to the graph with the appropriate `translatedTo` and
+`transcribedTo` edges.
+
+
+Model Enrichment - MySQL
+------------------------
+
+e(BE:L) goes one step further when compiling your BEL statements into a knowledge graph by supplementing your new graph model with information derived from several
+publicly available repositories. 
Data is automatically downloaded from several useful sites including `UniProt` , +`Ensembl`, and `IntAct` and added as generic tables in your newly built database. +Information from these popular repositories are then linked to the nodes and edges residing in your graph model, allowing for more complex and +useful queries to be made against your data. This data is automatically downloaded, parsed, and imported into a specified SQL database. + +Importing - Getting Started +--------------------------- + +e(BE:L) supports OrientDB as graph database and [MySQL](https://www.mysql.com) and MariaDB as [RDBMS](https://en.wikipedia.org/wiki/Relational_database) + +Make sure you have downloaded/installed and running + +1. `OrientDB` +2. MySQL or MariaDB +3. Relational Database + * MySQL + - [Windows](https://dev.mysql.com/doc/refman/8.0/en/windows-installation.html>) + - [MacOS](https://dev.mysql.com/doc/refman/8.0/en/macos-installation.html>) + - Linux + * [Ubuntu, Debian, Linux Mint, ...](https://dev.mysql.com/doc/mysql-apt-repo-quick-guide/en/) + - [RedHat, Fedora, CentOS, OpenSUSE, Scientific Linux, ...](https://dev.mysql.com/doc/refman/8.0/en/linux-installation-yum-repo.html>) + * MariaDB + - [Windows](https://mariadb.com/kb/en/installing-mariadb-msi-packages-on-windows/) + - [MacOS PKG](https://mariadb.com/kb/en/installing-mariadb-server-pkg-packages-on-macos/) + - [Homebrew](https://mariadb.com/kb/en/installing-mariadb-on-macos-using-homebrew/) + - Linux + - [Ubuntu, Debian, Linux Mint, ...](https://mariadb.com/kb/en/yum/) + - [RedHat, Fedora, CentOS, OpenSUSE, Scientific Linux, ...](https://mariadb.com/kb/en/installing-mariadb-deb-files/) + +This can be configured as a service in both Windows and Unix systems. 
+ +Set your MySQL connection parameters in e(BE:L) + +```bash +ebel set-mysql --host localhost --user root --password myPassWord --database ebel +``` + +Once you have made sure both OrientDB and MySQL are running, you can now import an e(BE:L) compiled JSON file + +```bash +ebel import-json /path/to/checked_bel.json -u root -p orientdbPassword -d ebel -h localhost -p 2424 +``` + +After you have successfully connected to the OrientDB database at least once, the login credentials will be written to the config file and no longer need to be passed (same with ``enrich`` command) + +```bash +ebel import-json /path/to/checked_bel.json +``` + +You can also import all e(BE:L) compiled JSON files in a passed directory + +```bash +ebel import-json /path/to/bel_json/dir/ +``` + +If you do no wish to enrich the graph, or wish to disable the protein/RNA/gene extension step, you can toggle these with the following options + +```bash +ebel import-json /path/to/checked_bel.json -e -g +``` + +You can run an enrichment step later using the ``enrich`` command + +```bash +ebel enrich +``` + +This command can also be given a list of resources to either skip or include during enrichment + +```bash +ebel enrich -i uniprot,hgnc +``` + +or + +```bash +ebel enrich -s intact,kegg +``` diff --git a/docs/openapi.yml b/docs/openapi.yml index 56b3e20..eb41e01 100644 --- a/docs/openapi.yml +++ b/docs/openapi.yml @@ -3405,10 +3405,6 @@ paths: in: query schema: type: string - - name: intermediate_filament_db - in: query - schema: - type: string - name: iuphar in: query schema: @@ -3437,10 +3433,6 @@ paths: in: query schema: type: string - - name: mamit_trnadb - in: query - schema: - type: string - name: merops in: query schema: @@ -3457,10 +3449,6 @@ paths: in: query schema: type: string - - name: pseudogene_org - in: query - schema: - type: string - name: snornabase in: query schema: diff --git a/docs/requirements.txt b/docs/requirements.txt index b8b0f72..4915927 100644 --- a/docs/requirements.txt 
+++ b/docs/requirements.txt @@ -2,6 +2,5 @@ mkdocs==1.3.1 mkdocs-autorefs==0.4.1 mkdocs-click==0.8.0 mkdocs-render-swagger-plugin==0.0.4 -mkdocstrings==0.19.0 -mkdocstrings-python==0.7.1 +mkdocstrings[python]>=0.18 jinja2<3.1.0 diff --git a/ebel/__init__.py b/ebel/__init__.py index 9387591..5cae1ea 100755 --- a/ebel/__init__.py +++ b/ebel/__init__.py @@ -1,22 +1,17 @@ """Root init for eBEL.""" -from . import parser -from . import constants -from . import errors -from . import transformers -from . import cache - +from . import cache, constants, errors, parser, transformers from .manager.orientdb.biodbs.bel import Bel -__version__ = '1.0.25' +__version__ = "1.0.37" -__title__ = 'e(BE:L)' -__description__ = 'Validation and extension of biomedical knowledge graphs' -__url__ = 'https://github.com/e-bel/ebel' +__title__ = "e(BE:L)" +__description__ = "Validation and extension of biomedical knowledge graphs" +__url__ = "https://github.com/e-bel/ebel" -__author__ = 'Christian Ebeling' -__email__ = 'christian.ebeling@scai.fraunhofer.de' +__author__ = "Christian Ebeling" +__email__ = "christian.ebeling@scai.fraunhofer.de" -__license__ = '?' +__license__ = "?" 
__copyright__ = """Copyright (c) 2021 Christian Ebeling, Fraunhofer Institute for Algorithms and Scientific Computing SCAI, Schloss Birlinghoven, 53754 Sankt Augustin, Germany""" diff --git a/ebel/__main__.py b/ebel/__main__.py index 8804b9e..bfe8987 100755 --- a/ebel/__main__.py +++ b/ebel/__main__.py @@ -2,5 +2,5 @@ from .cli import main -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ebel/cache.py b/ebel/cache.py index 3cb4864..7f4afb1 100755 --- a/ebel/cache.py +++ b/ebel/cache.py @@ -1,44 +1,36 @@ """Collection of methods for handling information caching.""" -import re -import logging import getpass -import pymysql +import logging -from lark.lexer import Token -from collections import defaultdict, Counter -from typing import Generator, List, Dict, DefaultDict, Set +import pymysql from ebel import defaults -from ebel.manager import models -from ebel.tools import BelRdb from ebel.config import write_to_config -from ebel.warnings import AlsoUsedInOtherNamespace, _Warning -from ebel.constants import URL, PATTERN, LIST, ALLOWED_TYPES -from ebel.constants import GRAMMAR_START_NS, GRAMMAR_START_ANNO -from ebel.errors import NotInNamespaceUrl, NotInAnnotationUrl, WithoutDefinedNamespace, \ - WithoutDefinedAnnotation, NotInNamespaceList, NotInAnnotationList, \ - NotInNamespacePattern, NotInAnnotationPattern, NotDownloadedFromUrl, _Error -# TODO: Because of change of `BelScript.ALLOWED_TYPES` FILE have to be handled on different way +# TODO: Decide whether to use these methods or those from database logger = logging.getLogger(__name__) def set_mysql_interactive() -> tuple: """Interactive mode to setup MySQL database.""" - print("Interactive mode\n \ + print( + "Interactive mode\n \ ================\n \ - 1st setup of db and user with root:\n") - host = input("host[localhost]:") or 'localhost' - user = input("ebel user[ebel]:") or 'ebel' - password = getpass.getpass(prompt="ebel password[ebel]:") or 'ebel' - db = input("database name[ebel]") 
or 'ebel' - print("If you want to setup the database automatically,\n \ - then type in the root password, otherwise nothing") - root_pwd = getpass.getpass(prompt='root password (only for 1st setup):') + 1st setup of db and user with root:\n" + ) + host = input("host[localhost]:") or "localhost" + user = input("ebel user[ebel]:") or "ebel" + password = getpass.getpass(prompt="ebel password[ebel]:") or "ebel" + db = input("database name[ebel]") or "ebel" + print( + "If you want to setup the database automatically,\n \ + then type in the root password, otherwise nothing" + ) + root_pwd = getpass.getpass(prompt="root password (only for 1st setup):") if root_pwd: - root_host = getpass.getpass(prompt='IP or name mysql server [localhost]:') or 'localhost' + root_host = getpass.getpass(prompt="IP or name mysql server [localhost]:") or "localhost" conn = pymysql.connect(host=root_host, user="root", password=root_pwd) c = conn.cursor() db_exists = c.execute("show databases like '{}'".format(db)) @@ -72,11 +64,13 @@ def set_mysql_interactive() -> tuple: return host, user, password, db -def set_mysql_connection(host: str = 'localhost', - user: str = 'ebel_user', - password: str = 'ebel_passwd', - db: str = 'ebel', - charset: str = 'utf8mb4'): +def set_mysql_connection( + host: str = "localhost", + user: str = "ebel_user", + password: str = "ebel_passwd", + db: str = "ebel", + charset: str = "utf8mb4", +): """Set the connection using MySQL Parameters. Parameters @@ -98,12 +92,9 @@ def set_mysql_connection(host: str = 'localhost', SQLAlchemy MySQL connection string. 
""" - connection_string = 'mysql+pymysql://{user}:{passwd}@{host}/{db}?charset={charset}'.format( - host=host, - user=user, - passwd=password, - db=db, - charset=charset) + connection_string = "mysql+pymysql://{user}:{passwd}@{host}/{db}?charset={charset}".format( + host=host, user=user, passwd=password, db=db, charset=charset + ) set_connection(connection_string) return connection_string @@ -118,7 +109,7 @@ def set_always_create_new_db(always_create_new_db: bool = True) -> None: Option `always_create_new_db` in section `database` in config file. """ - write_to_config('database', 'always_create_new', str(always_create_new_db)) + write_to_config("database", "always_create_new", str(always_create_new_db)) def set_connection(connection: str = defaults.CONN_STR_DEFAULT) -> None: @@ -130,726 +121,4 @@ def set_connection(connection: str = defaults.CONN_STR_DEFAULT) -> None: SQLAlchemy connection string. """ - write_to_config('database', 'sqlalchemy_connection_string', connection) - - -class _BelScript: - """Cache the content of the BEL script and methods to find errors and warnings.""" - - def __init__(self, force_new_db): - """Init.""" - # setup database - engine = BelRdb().engine - - self.force_new_db = force_new_db - models.reset_tables(engine, self.force_new_db) - - self._namespaces = Namespaces() # entries Namespace objects - self._annotations = Annotations() # entries Annotation objects - - self.__namespace_in_db_updated = False - self.__annotations_in_db_updated = False - - self._namespace_entries = NamespaceEntries() - self._annotation_entries = AnnotationEntries() - - self.notDownloadedFromUrls = [] - - self.namespace_manager = models.NamespaceManager( - model=models.Namespace, - entries_model=models.NamespaceEntry, - grammar_start=GRAMMAR_START_NS - ) - - self.annotation_manager = models.AnnotationManager( - model=models.Annotation, - entries_model=models.AnnotationEntry, - grammar_start=GRAMMAR_START_ANNO - ) - - def set_namespace_definition(self, as_type, 
keyword, value): - """Set an annotation definition with type, keyword and value value could be 'file', 'url' or 'list'. - - :param str as_type: 'file', 'url' or 'list' - :param str keyword: namespace keyword - :param str value: URL, file path or list - """ - if as_type in ALLOWED_TYPES: - self._namespaces.add(as_type, keyword, value) - return True - else: - logger.error("{} is not a allowed type of {}".format(as_type, ALLOWED_TYPES)) - return False - - def set_annotation_definition(self, as_type, keyword, value): - """Set an annotation definition with type, keyword and value could be 'file', 'url' or 'list'. - - :param str as_type: 'file', 'url' or 'list' - :param str keyword: namespace keyword - :param str value: URL, file path or list - """ - if as_type in ALLOWED_TYPES: - self._annotations.add(as_type, keyword, value) - return True - else: - logger.error("{} is not an allowed type of {}".format(as_type, ALLOWED_TYPES)) - return False - - def set_annotation_entry(self, annotation: str, entry: str, token: Token): - """Set annotation, entry and lark.lexer.Token token. - - :param str annotation: annotation - :param str entry: entry - :param lark.lexer.Token token: - """ - self._annotation_entries.set_annotation_entry(keyword=annotation, entry=entry, token=token) - - def set_namespace_entry(self, namespace: str, entry: str, token: Token): - """Set namespace, entry and lark.lexer.Token token. 
- - :param str namespace: - :param str entry: - :param lark.lexer.Token token: - """ - if not isinstance(token, Token): - raise Exception("expecting Token in cache.set_namespace_entry") - - self._namespace_entries.set_namespace_entry(keyword=namespace, entry=entry, token=token) - - @property - def errors(self) -> List[_Error]: - """Execute all methods to find errors and warnings.""" - self.update_database() - - # all errors are children from errors._Error instances - return (self.notDownloadedFromUrls - + self.entries_without_namespace - + self.entries_without_annotation - + self.entries_not_in_namespace_url - + self.entries_not_in_annotation_url - + self.entries_not_in_namespace_list - + self.entries_not_in_annotation_list - + self.entries_not_in_namespace_pattern - + self.entries_not_in_annotation_pattern - ) - - @property - def warnings(self) -> List[_Warning]: - """Execute all methods to find warnings.""" - if not (self.__namespace_in_db_updated and self.__annotations_in_db_updated): - self.update_database() - - # all warnings are children from warnings._Warning instances - return self.entries_also_in_other_namespace - - @property - def entries_also_in_other_namespace(self) -> List[AlsoUsedInOtherNamespace]: - """Return WithoutDefinedNamespace list.""" - ret = [] - # extract all entries used in BEL statements and create a dict of lower entries with all keywords - entry_keyword_dict = defaultdict(set) - for keyword1, entries in self._namespace_entries.entries.items(): - for entry in entries: - entry_keyword_dict[entry.lower()] |= {keyword1} - # identify all ambiguous entries (in more than 1 namespace) - ambiguous_entries = {entry: keywords for entry, keywords in entry_keyword_dict.items() if len(keywords) > 1} - - # ToDo: iterate all lower entries an check for permutation -# for lower_entry in entry_keyword_dict: -# if lower_entry.count(",") == 1: -# reverse_without_comma = " ".join([x.strip() for x in lower_entry.split(",")][::-1]) -# if reverse_without_comma 
in entry_keyword_dict: -# print(lower_entry, - # "%s exists in %s" % (reverse_without_comma, - # entry_keyword_dict[reverse_without_comma])) -# ret.append(AlsoUsedInOtherNamespace(keyword=keyword2, -# entry=entry, -# line_number=token.line, -# column=token.column, -# hint=hint)) - - # iterate all tokens with namespace entries and check if they are also exists in ambiguous entries - for keyword2, entries_tokens in self._namespace_entries.tokens.items(): - for entry, tokens in entries_tokens.items(): - if entry.lower() in ambiguous_entries: - ambiguous_tokens = self._namespace_entries.tokens[keyword2][entry] - for token in ambiguous_tokens: - hint = "%s exists also in %s" % (entry, ambiguous_entries[entry.lower()] - {keyword2}) - ret.append(AlsoUsedInOtherNamespace(keyword=keyword2, - entry=entry, - line_number=token.line, - column=token.column, - hint=hint)) - return ret - - @property - def entries_not_in_namespace_pattern(self) -> List[NotInNamespacePattern]: - """Return a list of entries not fitting a given namespace pattern.""" - ret = [] - - ns_pattern_kwds = self.used_namespace_keywords & self._namespaces.keywords_by_type(PATTERN) - - for kwd in ns_pattern_kwds: - regex = self._namespaces.keyword_dict[kwd].value - pattern = re.compile("^" + regex + "$") - elcs = self._namespace_entries.get_entry_line_column_list_by_keyword(kwd) - for entry, line, column in elcs: - if not pattern.search(entry): - ret.append(NotInNamespacePattern(keyword=kwd, - entry=entry, - line_number=line, - column=column)) - return ret - - @property - def entries_not_in_annotation_pattern(self) -> List[NotInAnnotationPattern]: - """Return a list of entries not fitting a given annotation pattern.""" - ret = [] - - anno_pattern_kwds = self.used_annotation_keywords & self._annotations.keywords_by_type(PATTERN) - - for kwd in anno_pattern_kwds: - regex = self._annotations.keyword_dict[kwd].value - pattern = re.compile("^" + regex + "$") - elcs = 
self._annotation_entries.get_entry_line_column_list_by_keyword(kwd) - for entry, line, column in elcs: - if not pattern.search(entry): - ret.append(NotInAnnotationPattern(keyword=kwd, - entry=entry, - line_number=line, - column=column)) - return ret - - @property - def entries_not_in_annotation_list(self) -> List[NotInAnnotationList]: - """Return a list of entries not in a given annotations.""" - ret = [] - - anno_kwd_used_and_as_list = self.used_annotation_keywords & self._annotations.keywords_by_type(LIST) - - for kwd in anno_kwd_used_and_as_list: - elcs = self._annotation_entries.get_entry_line_column_list_by_keyword(kwd) - for entry, line, column in elcs: - if entry not in self._annotations.keyword_dict[kwd].value: - ret.append(NotInAnnotationList(keyword=kwd, - entry=entry, - line_number=line, - column=column)) - return ret - - @property - def entries_not_in_namespace_list(self) -> List[NotInNamespaceList]: - """Return a list of entries not in a given namespace.""" - ret = [] - - ns_kwd_used_and_as_list = self.used_namespace_keywords & self._namespaces.keywords_by_type(LIST) - - for kwd in ns_kwd_used_and_as_list: - elcs = self._namespace_entries.get_entry_line_column_list_by_keyword(kwd) - for entry, line, column in elcs: - if entry not in self._namespaces.keyword_dict[kwd].value: - ret.append(NotInNamespaceList(keyword=kwd, - entry=entry, - line_number=line, - column=column)) - return ret - - @property - def entries_without_namespace(self) -> List[WithoutDefinedNamespace]: - """Return WithoutDefinedNamespace list.""" - ret = [] - for namespace_keyword in self.namespaces_without_definition: - elcs = self._namespace_entries.get_entry_line_column_list_by_keyword(namespace_keyword) - for entry, line, column in elcs: - ret.append(WithoutDefinedNamespace(keyword=namespace_keyword, - entry=entry, - line_number=line, - column=column)) - return ret - - @property - def entries_without_annotation(self) -> List[WithoutDefinedAnnotation]: - """Return 
WithoutDefinedNamespace list.""" - ret = [] - for annotation_keyword in self.annotations_without_definition: - elcs = self._annotation_entries.get_entry_line_column_list_by_keyword(annotation_keyword) - for entry, line, column in elcs: - ret.append(WithoutDefinedAnnotation(keyword=annotation_keyword, - entry=entry, - line_number=line, - column=column)) - return ret - - def update_database(self) -> None: - """Update namespace and annotation entries in database if not exists by url and keyword.""" - if not (self.__namespace_in_db_updated and self.__annotations_in_db_updated): - self.__namespace_in_db_updated = self.update_namespaces_in_db() - self.__annotations_in_db_updated = self.update_annotations_in_db() - - def set_entry_not_in_namespace_list_errors(self): - pass - - def set_entry_not_in_annotation_list_errors(self): - pass - - @property - def entries_not_in_namespace_url(self) -> List[NotInNamespaceUrl]: - """Return a list of entries not exists in namespaces referenced as URL. - - Returns - ------- - List[NotInNamespaceUrl] - Description of returned object. 
- """ - entries_not_in_namespace = [] - - for keyword in self.used_namespace_keywords: - - namespace = self._namespaces.keyword_dict[keyword] - - if namespace.as_type == URL: - - url = namespace.value - elc_list = self._namespace_entries.get_entry_line_column_list_by_keyword(keyword) - - names_not_exists = self.namespace_manager.get_entries_not_exists( - keyword=keyword, - url=url, - entry_line_column_list=elc_list, - ) - - for entry, line, column, hint in names_not_exists: - error = NotInNamespaceUrl( - keyword=keyword, - url_or_path=url, - entry=entry, - line_number=line, - column=column, - hint=hint - ) - entries_not_in_namespace.append(error) - - return entries_not_in_namespace - - @property - def entries_not_in_annotation_url(self) -> List[_Error]: - """Return a list of entries not in the annotation URL.""" - entries_not_in_annotation = [] - - for keyword in self.used_annotation_keywords: - - annotation = self._annotations.keyword_dict[keyword] - - if annotation.as_type == URL: - - url = annotation.value - elc_list = self._annotation_entries.get_entry_line_column_list_by_keyword(keyword) - - names_not_exists = self.annotation_manager.get_entries_not_exists( - keyword=keyword, - url=url, - entry_line_column_list=elc_list - ) - - for entry, line, column, hint in names_not_exists: - error = NotInAnnotationUrl( - keyword=keyword, - url_or_path=url, - entry=entry, - line_number=line, - column=column, - hint=hint - ) - entries_not_in_annotation.append(error) - return entries_not_in_annotation - - def update_annotations_in_db(self) -> bool: - """Update annotation in database if URL and keyword not exists.""" - import_success = True - for anno in self._annotations.to_update: - - if anno.keyword in self.used_annotation_keywords: - - if not self.annotation_manager.keyword_url_exists(keyword=anno.keyword, url=anno.value): - - if anno.as_type == URL: - - logger.info(f"Update db with annotation {anno.keyword}: download from {anno.value}") - - successful, error = 
self.annotation_manager.save_from_url_or_path( - keyword=anno.keyword, - url_or_path=anno.value, - doc_type=anno.as_type, - ) - - if not successful: - import_success = False - error_args = error.args[0].split("\n") - string_error = error_args[2] if len(error_args) > 1 else error_args[0] - logger.error(f"Annotation {anno.keyword} failed to be added from {anno.value}", - exc_info=False) - - if "column" in dir(error): # Indicates it's a Lark error - download_error = NotDownloadedFromUrl( - keyword=anno.keyword, - url_or_path=anno.value, - column=error.column, - line=error.line, - hint=f'{error.allowed} error in "{string_error}"' - ) - - else: # It's an HTTPError of some kind - download_error = NotDownloadedFromUrl( - keyword=anno.keyword, - url_or_path=anno.value, - column=0, - line=0, - hint=f"{string_error}" - ) - self.notDownloadedFromUrls.append(download_error) - - return import_success - - def update_namespaces_in_db(self) -> bool: - """Update namespaces in database if URL and keyword does not exist.""" - import_success = True - for ns in self._namespaces.to_update: - - if ns.keyword in self.used_namespace_keywords: - - if not self.namespace_manager.keyword_url_exists(keyword=ns.keyword, url=ns.value): - - if ns.as_type == URL: - - logger.info(f"Update db with namespace {ns.keyword}: download from {ns.value}") - - successful, error = self.namespace_manager.save_from_url_or_path( - keyword=ns.keyword, - url_or_path=ns.value, - doc_type=ns.as_type, - ) - - if not successful: - import_success = False - error_args = error.args[0].split("\n") - string_error = error_args[2] if len(error_args) > 1 else error_args[0] - logger.error(f"Namespace {ns.keyword} failed to be added from {ns.value}", - exc_info=False) - - if "column" in dir(error): # Indicates it's a Lark error - download_error = NotDownloadedFromUrl( - keyword=ns.keyword, - url_or_path=ns.value, - column=error.column, - line=error.line, - hint=f'{error.allowed} error in "{string_error}"' - ) - - else: # It's an 
HTTPError of some kind - download_error = NotDownloadedFromUrl( - keyword=ns.keyword, - url_or_path=ns.value, - column=0, - line=0, - hint=f"{string_error}" - ) - - self.notDownloadedFromUrls.append(download_error) - - return import_success - - @property - def namespaces_with_multiple_definitions(self): - """Return all Namespace objects with several definitions. - - This is returned as a dictionary (key:keyword, value: list of Namespace objects). - """ - ret = defaultdict(list) - multiple_keyword = [k for k, v in Counter(self._namespaces.keywords).items() if v > 1] - for ns in self._namespaces: - if ns.keyword in multiple_keyword: - ret[ns.keyword].append(ns) - return dict(ret) - - @property - def annotations_with_multiple_definitions(self): - """Return all Annotation objects with several definitions. - - This is returned as a dictionary (key:keyword, value: list of Annotation objects). - """ - ret = defaultdict(list) - multiple_keyword = [k for k, v in Counter(self._annotations.keywords).items() if v > 1] - for anno in self._annotations: - if anno.keyword in multiple_keyword: - ret[anno.keyword].append(anno) - return dict(ret) - - @property - def namespaces_without_definition(self): - """Return set of namespace keywords used in statements but not defined with a reference. - - :return set: set of str - """ - return set(self._namespace_entries.keywords) - set(self._namespaces.keywords) - - @property - def annotations_without_definition(self): - """Return a set of annotation keywords not defined with a reference. 
- - :return set: set of str - """ - return set(self._annotation_entries.keywords) - set(self._annotations.keywords) - - @property - def used_namespace_keywords(self) -> Set[str]: - """Return set of used namespace keywords (with reference and used in statements).""" - return set(self._namespace_entries.keywords) & set(self._namespaces.keywords) - - @property - def used_annotation_keywords(self) -> Set[str]: - """Return set of used namespace keywords.""" - return set(self._annotation_entries.keywords) & set(self._annotations.keywords) - - @property - def namespace_keywords_in_statements(self): - """Return all unique namespace keywords used in statements.""" - return self._namespace_entries.keywords - - @property - def annotation_keywords_in_statements(self): - """Return all unique annotation keywords used in statements.""" - return self._namespace_entries.keywords - - def get_entries_by_namespace_keyword(self, keyword): - """Get all entries by namespace keyword. - - :param keyword: namespace keyword - :return set: all entries in the namespace - """ - return self._namespace_entries.get_entries_by_keyword(keyword) - - def get_entries_by_annotation_keyword(self, keyword): - """Get all entries by namespace keyword. - - :param keyword: namespace keyword - :return set: all entries in the namespace - """ - return self._annotation_entries.get_entries_by_keyword(keyword) - - -class Entries: - """Abstract class representing namespaces and annotations.""" - - tokens = defaultdict(dict) - entries = defaultdict(set) - - def get_entry_line_column_list_by_keyword(self, keyword: str) -> Generator[str, int, int]: - """Get generator of tuple(entry, line, column) by keyword. - - Parameters - ---------- - keyword: str - Description of parameter `keyword: str`. - - Returns - ------- - Generator - Generator of tuple(entry: str, line: int, column: int). 
- """ - for entry, tokens in self.tokens[keyword].items(): - for token in tokens: - yield entry, token.line, token.column - - @property - def keywords(self): - """Return a list of unique keywords used in SETs.""" - return self.entries.keys() - - def get_entries_by_keyword(self, keyword: str) -> Set: - """Get entries by keyword. - - :param str keyword: keyword to retrieve from dict - """ - return self.entries.get(keyword, set()) - - def get_tokens_by_keyword(self, keyword: str) -> Dict: - """Get tokens by keyword. - - :param str keyword: keyword to retrieve from dict - """ - return self.entries.get(keyword, set()) - - def __str__(self): - """String representation of object.""" - return str(dict(self.tokens)) - - -class NamespaceEntries(Entries): - """Namespace subclass of Entries.""" - - def __init__(self): - """Init.""" - self.entries = defaultdict(set) - self.tokens = defaultdict(dict) - - def set_namespace_entry(self, keyword, entry, token): - """Set namespace, entry and lark.lexer.Token. - - :param str keyword: namespace - :param str entry: entry - :param lark.lexer.Token token: Token object from lark library - """ - if isinstance(token, Token): - self.entries[keyword] |= {entry} - if keyword in self.tokens and entry in self.tokens[keyword]: - self.tokens[keyword][entry].append(token) - else: - self.tokens[keyword][entry] = [token] - else: - raise "Argument token is type {} not {}".format(type(token), 'lark.lexer.Token') - - -class AnnotationEntries(Entries): - """Annotation subclass of Entries.""" - - def __init__(self): - """Init.""" - self.entries = defaultdict(set) - self.tokens = defaultdict(dict) - - def set_annotation_entry(self, keyword: str, entry, token): - """Set annotation, entry and lark.lexer.Token. 
- - :param str keyword: annotation - :param entry: entry - :param lark.lexer.Token token: Token object from lark library - """ - if isinstance(token, Token): - self.entries[keyword] |= {entry} - if keyword in self.tokens and entry in self.tokens[keyword]: - self.tokens[keyword][entry].append(token) - else: - self.tokens[keyword][entry] = [token] - else: - raise "argument token is type {} not {}".format(type(token), 'lark.lexer.Token') - - -class NsAnsBase: - """Parent class for class Namespace and Annotation.""" - - def __init__(self, obj_class): - """Init.""" - self.__objs = [] - self.class_ = obj_class - - def add(self, as_type: str, keyword: str, value: str): - """Add obj to list of objs. - - :param str as_type: allowed keywords 'file', 'url' or 'list' - :param str keyword: keyword used in object - :param str value: value of object - :return: - """ - obj = self.class_(as_type, keyword, value) - self.__objs.append(obj) - - @property - def type_dict(self) -> DefaultDict: - """Convert to list of dictionaries.""" - ret = defaultdict(list) - [ret[obj.as_type].append(obj) for obj in self] - return ret - - def by_type(self, as_type: str): - """Return list of Namespace objects by 'list', 'url' or 'file'.""" - if as_type not in ALLOWED_TYPES: - raise "{} not in allowed types {}".format(as_type, ALLOWED_TYPES) - return [obj for obj in self if obj.as_type == as_type] - - def keywords_by_type(self, as_type: str) -> Set[str]: - """Return a set of keywords by Namespace type 'list', 'url' or 'file'.""" - if as_type not in ALLOWED_TYPES: - raise "{} not in allowed types {}".format(as_type, ALLOWED_TYPES) - return set([obj.keyword for obj in self if obj.as_type == as_type]) - - @property - def keyword_dict(self) -> Dict: - """Return a dictionary of key=keyword, value: Namespace or Annotation object.""" - ret = dict() - for obj in self: - ret[obj.keyword] = obj - return ret - - @property - def keywords(self) -> List[str]: - """Return all keywords.""" - return [obj.keyword for obj 
in self.__objs] - - @property - def to_update(self) -> List: - """Return a list of all Namespace or Annotation (NS_or_Anno) objects with URL or file path. - - :return list: list of all Namespace or Annotation (NS_or_Anno) objects with URL or file path - """ - return self.type_dict[URL] - - def __iter__(self): - """Return a generator of objects (Namespace or Annotation).""" - for obj in self.__objs: - yield obj - - -class Namespaces(NsAnsBase): - """Namespace child class.""" - - def __init__(self): - """init.""" - super(Namespaces, self).__init__(obj_class=Namespace) - - -class Annotations(NsAnsBase): - """Annotation child class.""" - - def __init__(self): - """init.""" - super(Annotations, self).__init__(obj_class=Annotation) - - -class Namespace: - """Namespace class to represent BEL statement namespaces.""" - - def __init__(self, as_type, keyword, value): - """Namespace init.""" - self.as_type = as_type - self.keyword = keyword - self.value = value - - def to_dict(self): - """Convert class values to dictionary.""" - return {'as_type': self.as_type, 'keyword': self.keyword, 'value': self.value} - - def __unicode__(self): - return "Namespace:" + str(self.to_dict()) - - def __str__(self): - return self.__unicode__() - - -class Annotation: - """Annotation class to represent BEL statement annotations.""" - - def __init__(self, as_type, keyword, value): - """Annotation init.""" - self.as_type = as_type - self.keyword = keyword - self.value = value - - def to_dict(self): - """Convert class values to dictionary.""" - return {'as_type': self.as_type, 'keyword': self.keyword, 'value': self.value} - - def __unicode__(self): - return "Annotation" + str(self.to_dict()) - - def __str__(self): - return self.__unicode__() + write_to_config("database", "sqlalchemy_connection_string", connection) diff --git a/ebel/cli.py b/ebel/cli.py index e487074..9550134 100755 --- a/ebel/cli.py +++ b/ebel/cli.py @@ -1,15 +1,15 @@ """Command line interface to e(BE:L).""" -import sys import 
logging +import sys import click import ebel.database from ebel import Bel, web -from ebel.config import user_config_setup, set_configuration +from ebel.config import set_configuration, user_config_setup from ebel.manager.orientdb.constants import DRUGBANK -from ebel.validate import validate_bel_file, repair_bel_file +from ebel.validate import repair_bel_file, validate_bel_file logger = logging.getLogger(__name__) @@ -24,74 +24,144 @@ def main(): # TODO(@ceb): Implement by default "keep database" and new database as option #prio1 # TODO(@ceb): SQLAlchemy connection string #prio1 @main.command() -@click.argument('bel_script_path') -@click.option('-l', '--line_by_line', is_flag=True, default=False, help='check script line by line') -@click.option('-r', '--reports', default=None, - help='path(s) to report file(s) seperated by comma with suffix (.md, .txt, .csv, .tsv, .json, .html)') -@click.option('-v', '--bel_version', default="2_1", help='stores a report in a file') -@click.option('-t', '--tree', is_flag=True, default=False, help='shows tree') +@click.argument("bel_script_path") +@click.option( + "-l", + "--line_by_line", + is_flag=True, + default=False, + help="check script line by line", +) +@click.option( + "-r", + "--reports", + default=None, + help="path(s) to report file(s) seperated by comma with suffix (.md, .txt, .csv, .tsv, .json, .html)", +) +@click.option("-v", "--bel_version", default="2_1", help="stores a report in a file") +@click.option("-t", "--tree", is_flag=True, default=False, help="shows tree") # TODO: Implement new Cytoscape export file # @click.option('-c', '--cytoscape', is_flag=True, default=False, help='creates cytoscape file') -@click.option('-s', '--sqlalchemy_connection_str', default=None, help="SQLAlchmy connection string") -@click.option('-j', '--json_file', is_flag=True, help="Create json file") -@click.option('-f', '--force_json', is_flag=True, default=False, help="Force the creation of a JSON file") -def validate(bel_script_path: 
str, line_by_line: bool, reports: str, - bel_version: str, tree: bool, sqlalchemy_connection_str: str, - json_file: bool, force_json: bool): +@click.option( + "-s", + "--sqlalchemy_connection_str", + default=None, + help="SQLAlchmy connection string", +) +@click.option("-j", "--json_file", is_flag=True, help="Create json file") +@click.option( + "-f", + "--force_json", + is_flag=True, + default=False, + help="Force the creation of a JSON file", +) +def validate( + bel_script_path: str, + line_by_line: bool, + reports: str, + bel_version: str, + tree: bool, + sqlalchemy_connection_str: str, + json_file: bool, + force_json: bool, +): """Validate a BEL file using the defined grammar.""" - validate_bel_file(bel_script_path=bel_script_path, line_by_line=line_by_line, reports=reports, - bel_version=bel_version, tree=tree, sqlalchemy_connection_str=sqlalchemy_connection_str, - json_file=json_file, force_json=force_json) + validate_bel_file( + bel_script_path=bel_script_path, + line_by_line=line_by_line, + reports=reports, + bel_version=bel_version, + tree=tree, + sqlalchemy_connection_str=sqlalchemy_connection_str, + json_file=json_file, + force_json=force_json, + ) @main.command() -@click.argument('bel_script_path') -@click.option('-n', '--new_file_path', default=None, - help='Path to write repaired file to. 
If none passed, will overwrite original file.') -def repair(bel_script_path: str, new_file_path: str): +@click.argument("bel_script_path") +@click.argument("repaired_file_path") +@click.option( + "-d", + "--diff", + is_flag=True, + default=False, + help="Also export a file showing the differences between the original and repaired file.", +) +def repair(bel_script_path: str, repaired_file_path: str, diff: bool): """Repair the BEL file for common delimiters and line separations.""" - repair_bel_file(bel_script_path, new_file_path) + repair_bel_file(bel_script_path, repaired_file_path, diff=diff) @main.command() -@click.argument('json_file_path') -@click.option('-e', '--extend', is_flag=True, default=True, help='Flag to disable "extension" during import') -@click.option('-g', '--p2g', is_flag=True, default=True, help='Flag to disable "protein2gene" during import') -@click.option('-s', '--skip_drugbank', is_flag=True, default=False, help='Flag to disable DrugBank') -@click.option('-i', '--include_subfolders', is_flag=True, default=False, help='Flag to enable directory walking') -@click.option('--drugbank_user', help='Valid username for DrugBank') -@click.option('--drugbank_password', help='Valid username for DrugBank') -@click.option('-n', '--odb_name', help='OrientDB database name') -@click.option('-u', '--odb_user', help='OrientDB user (with admin rights)') -@click.option('-p', '--odb_password', help='OrientDB user (with admin rights) password') -@click.option('-h', '--odb_server', help='OrientDB server name or URI') -@click.option('-o', '--odb_port', help='OrientDB server port') -@click.option('--odb_user_reader', help=' OrientDB user with only read rights') -@click.option('--odb_user_reader_password', help='OrientDB user with only read rights password') -@click.option('--odb_root_password', help='OrientDB root user password (only during database setup)') -@click.option('--kegg_species', help='KEGG species') -@click.option('--sqlalchemy_connection_string', 
help='schema is user:password@server/database') -@click.option('--snp_related_traits', help='key of SNP related traits in GWAS catalog') -@click.option('--drugbank_user', help='Drugbank user') -@click.option('--drugbank_password', help='DrugBank password') -def import_json(json_file_path: str, - extend: bool, - p2g: bool, - skip_drugbank: bool, - drugbank_user: str, - drugbank_password: str, - include_subfolders: bool, - odb_name: str, - odb_user: str, - odb_password: str, - odb_server: str, - odb_port: str, - odb_user_reader: str, - odb_user_reader_password: str, - odb_root_password: str, - kegg_species: str, - sqlalchemy_connection_string: str, - snp_related_traits: str): +@click.argument("json_file_path") +@click.option( + "-e", + "--extend", + is_flag=True, + default=True, + help='Flag to disable "extension" during import', +) +@click.option( + "-g", + "--p2g", + is_flag=True, + default=True, + help='Flag to disable "protein2gene" during import', +) +@click.option( + "-s", + "--skip_drugbank", + is_flag=True, + default=False, + help="Flag to disable DrugBank", +) +@click.option( + "-i", + "--include_subfolders", + is_flag=True, + default=False, + help="Flag to enable directory walking", +) +@click.option("--drugbank_user", help="Valid username for DrugBank") +@click.option("--drugbank_password", help="Valid username for DrugBank") +@click.option("-n", "--odb_name", help="OrientDB database name") +@click.option("-u", "--odb_user", help="OrientDB user (with admin rights)") +@click.option("-p", "--odb_password", help="OrientDB user (with admin rights) password") +@click.option("-h", "--odb_server", help="OrientDB server name or URI") +@click.option("-o", "--odb_port", help="OrientDB server port") +@click.option("--odb_user_reader", help=" OrientDB user with only read rights") +@click.option("--odb_user_reader_password", help="OrientDB user with only read rights password") +@click.option( + "--odb_root_password", + help="OrientDB root user password (only during 
database setup)", +) +@click.option("--kegg_species", help="KEGG species") +@click.option("--sqlalchemy_connection_string", help="schema is user:password@server/database") +@click.option("--snp_related_traits", help="key of SNP related traits in GWAS catalog") +@click.option("--drugbank_user", help="Drugbank user") +@click.option("--drugbank_password", help="DrugBank password") +def import_json( + json_file_path: str, + extend: bool, + p2g: bool, + skip_drugbank: bool, + drugbank_user: str, + drugbank_password: str, + include_subfolders: bool, + odb_name: str, + odb_user: str, + odb_password: str, + odb_server: str, + odb_port: str, + odb_user_reader: str, + odb_user_reader_password: str, + odb_root_password: str, + kegg_species: str, + sqlalchemy_connection_string: str, + snp_related_traits: str, +): """Import JSON into OrientDB. Parameters @@ -139,66 +209,95 @@ def import_json(json_file_path: str, returns True if imported. """ # if one of the parameters is not None it will overwrite the default values from the configfile - set_configuration(name=odb_name, - user=odb_user, - password=odb_password, - server=odb_server, - port=odb_port, - user_reader=odb_user_reader, - user_reader_password=odb_user_reader_password, - root_password=odb_root_password, - kegg_species=kegg_species, - sqlalchemy_connection_string=sqlalchemy_connection_string, - snp_related_traits=snp_related_traits, - drugbank_user=drugbank_user, - drugbank_password=drugbank_password) + set_configuration( + name=odb_name, + user=odb_user, + password=odb_password, + server=odb_server, + port=odb_port, + user_reader=odb_user_reader, + user_reader_password=odb_user_reader_password, + root_password=odb_root_password, + kegg_species=kegg_species, + sqlalchemy_connection_string=sqlalchemy_connection_string, + snp_related_traits=snp_related_traits, + drugbank_user=drugbank_user, + drugbank_password=drugbank_password, + ) bel = Bel() if "," in json_file_path: json_file_path = json_file_path.split() - 
bel.import_json(input_path=json_file_path, - extend_graph=extend, - update_from_protein2gene=p2g, - skip_drugbank=skip_drugbank, - drugbank_user=drugbank_user, - drugbank_password=drugbank_password, - include_subfolders=include_subfolders) + bel.import_json( + input_path=json_file_path, + extend_graph=extend, + update_from_protein2gene=p2g, + skip_drugbank=skip_drugbank, + drugbank_user=drugbank_user, + drugbank_password=drugbank_password, + include_subfolders=include_subfolders, + ) @main.command() -@click.option('-s', '--skip', default=[], help='Comma-separated list of databases to skip during enrichment') -@click.option('-i', '--include', default=[], help='Comma-separated list of databases to include during enrichment') -@click.option('--skip_drugbank', is_flag=True, default=False, help='Flag to disable DrugBank') -@click.option('--drugbank_user', default=None, help='Valid username for DrugBank') -@click.option('--drugbank_password', default=None, help='Valid username for DrugBank') -@click.option('-n', '--odb_name', default=None, help='OrientDB database name') -@click.option('-u', '--odb_user', default=None, help='OrientDB user (with admin rights)') -@click.option('-p', '--odb_password', default=None, help='OrientDB user (with admin rights) password') -@click.option('-h', '--odb_server', default=None, help='OrientDB server name or URI') -@click.option('-o', '--odb_port', default=None, help='OrientDB server port') -@click.option('--odb_user_reader', default=None, help=' OrientDB user with only read rights') -@click.option('--odb_user_reader_password', default=None, help='OrientDB user with only read rights password') -@click.option('--odb_root_password', default=None, help='OrientDB root user password (only during database setup)') -@click.option('--kegg_species', default='hsa,rno,mmu', help='KEGG species') -@click.option('--sqlalchemy_connection_string', help='schema is user:password@server/database') -@click.option('--snp_related_traits', help='key of SNP 
related traits in GWAS catalog and ClinVar') -def enrich(skip: str, - include: str, - skip_drugbank: bool, - drugbank_user: str, - drugbank_password: str, - odb_name: str, - odb_user: str, - odb_password: str, - odb_server: str, - odb_port: str, - odb_user_reader: str, - odb_user_reader_password: str, - odb_root_password: str, - kegg_species: str, - sqlalchemy_connection_string: str, - snp_related_traits: str): +@click.option( + "-s", + "--skip", + default=[], + help="Comma-separated list of databases to skip during enrichment", +) +@click.option( + "-i", + "--include", + default=[], + help="Comma-separated list of databases to include during enrichment", +) +@click.option("--skip_drugbank", is_flag=True, default=False, help="Flag to disable DrugBank") +@click.option("--drugbank_user", default=None, help="Valid username for DrugBank") +@click.option("--drugbank_password", default=None, help="Valid username for DrugBank") +@click.option("-n", "--odb_name", default=None, help="OrientDB database name") +@click.option("-u", "--odb_user", default=None, help="OrientDB user (with admin rights)") +@click.option( + "-p", + "--odb_password", + default=None, + help="OrientDB user (with admin rights) password", +) +@click.option("-h", "--odb_server", default=None, help="OrientDB server name or URI") +@click.option("-o", "--odb_port", default=None, help="OrientDB server port") +@click.option("--odb_user_reader", default=None, help=" OrientDB user with only read rights") +@click.option( + "--odb_user_reader_password", + default=None, + help="OrientDB user with only read rights password", +) +@click.option( + "--odb_root_password", + default=None, + help="OrientDB root user password (only during database setup)", +) +@click.option("--kegg_species", default="hsa,rno,mmu", help="KEGG species") +@click.option("--sqlalchemy_connection_string", help="schema is user:password@server/database") +@click.option("--snp_related_traits", help="key of SNP related traits in GWAS catalog and 
ClinVar") +def enrich( + skip: str, + include: str, + skip_drugbank: bool, + drugbank_user: str, + drugbank_password: str, + odb_name: str, + odb_user: str, + odb_password: str, + odb_server: str, + odb_port: str, + odb_user_reader: str, + odb_user_reader_password: str, + odb_root_password: str, + kegg_species: str, + sqlalchemy_connection_string: str, + snp_related_traits: str, +): """Trigger the enrichment step for a database. Parameters @@ -236,19 +335,21 @@ def enrich(skip: str, snp_related_traits: str SNP related traits """ - set_configuration(name=odb_name, - user=odb_user, - password=odb_password, - server=odb_server, - port=odb_port, - user_reader=odb_user_reader, - user_reader_password=odb_user_reader_password, - root_password=odb_root_password, - kegg_species=kegg_species, - sqlalchemy_connection_string=sqlalchemy_connection_string, - snp_related_traits=snp_related_traits, - drugbank_user=drugbank_user, - drugbank_password=drugbank_password) + set_configuration( + name=odb_name, + user=odb_user, + password=odb_password, + server=odb_server, + port=odb_port, + user_reader=odb_user_reader, + user_reader_password=odb_user_reader_password, + root_password=odb_root_password, + kegg_species=kegg_species, + sqlalchemy_connection_string=sqlalchemy_connection_string, + snp_related_traits=snp_related_traits, + drugbank_user=drugbank_user, + drugbank_password=drugbank_password, + ) bel = Bel() @@ -265,7 +366,7 @@ def enrich(skip: str, @main.command() -@click.argument('connection') +@click.argument("connection") def set_connection(connection): """Set the SQLAlchemy connection string. 
@@ -275,22 +376,17 @@ def set_connection(connection): @main.command() -@click.option('-h', '--host', default='localhost', help="MySQL server") -@click.option('-u', '--user', default='ebel_user', help="MySQL username") -@click.option('-p', '--password', default='ebel_passwd', help="MySQL password") -@click.option('-d', '--database', default='ebel', help="MySQL database name") -@click.option('-i', '--interactive', is_flag=True, default=False, help="Enable interactive mode") +@click.option("-h", "--host", default="localhost", help="MySQL server") +@click.option("-u", "--user", default="ebel_user", help="MySQL username") +@click.option("-p", "--password", default="ebel_passwd", help="MySQL password") +@click.option("-d", "--database", default="ebel", help="MySQL database name") +@click.option("-i", "--interactive", is_flag=True, default=False, help="Enable interactive mode") def set_mysql(host: str, user: str, password: str, database: str, interactive: bool): """Set the SQLAlchemy connection string with MySQL settings.""" if interactive: host, user, password, db = ebel.database.set_mysql_interactive() - ebel.database.set_mysql_connection( - host=host, - user=user, - password=password, - db=database - ) + ebel.database.set_mysql_connection(host=host, user=user, password=password, db=database) @main.command() @@ -300,14 +396,14 @@ def settings(): @main.command() -@click.option('-h', '--host', default='0.0.0.0', help='Server or host name') -@click.option('-p', '--port', default='5000', help='server port [5000]') -@click.option('-d', '--debug_mode', is_flag=True, default=False, help='debug mode') -@click.option('-o', '--open_browser', is_flag=True, default=False, help='open browser') +@click.option("-h", "--host", default="0.0.0.0", help="Server or host name") +@click.option("-p", "--port", default="5000", help="server port [5000]") +@click.option("-d", "--debug_mode", is_flag=True, default=False, help="debug mode") +@click.option("-o", "--open_browser", is_flag=True, 
default=False, help="open browser") def serve(host, port, debug_mode, open_browser): """Start the API RESTful server.""" web.app.run(host=host, port=port, debug_mode=debug_mode, open_browser=open_browser) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ebel/config.py b/ebel/config.py index 01e10b1..6fbd8db 100644 --- a/ebel/config.py +++ b/ebel/config.py @@ -1,65 +1,68 @@ """Methods for handling the configuration file.""" import configparser import logging +import os import random import re -import os import string from collections import namedtuple from configparser import RawConfigParser from getpass import getpass -from typing import Union, Optional +from typing import Optional, Union from urllib.parse import quote import pymysql from ebel import defaults -from ebel.constants import DEFAULT_ODB, TerminalFormatting as TF +from ebel.constants import DEFAULT_ODB +from ebel.constants import TerminalFormatting as TF from ebel.defaults import CONN_STR_DEFAULT, DATABASE_LOCATION -def set_configuration(name: str = None, - user: str = None, - password: str = None, - server: str = None, - port: Union[str, int] = None, - user_reader: str = None, - user_reader_password: str = None, - root_password: str = None, - kegg_species: str = None, - sqlalchemy_connection_string: str = None, - snp_related_traits: str = None, - drugbank_user: str = None, - drugbank_password: str = None) -> dict: +def set_configuration( + name: Optional[str] = None, + user: Optional[str] = None, + password: Optional[str] = None, + server: Optional[str] = None, + port: Union[str, int, None] = None, + user_reader: Optional[str] = None, + user_reader_password: Optional[str] = None, + root_password: Optional[str] = None, + kegg_species: Optional[str] = None, + sqlalchemy_connection_string: Optional[str] = None, + snp_related_traits: Optional[str] = None, + drugbank_user: Optional[str] = None, + drugbank_password: Optional[str] = None, +) -> dict: """Set configuration values in 
the config file.""" odb_class_attribs = { - 'name': name, - 'user': user, - 'password': password, - 'server': server, - 'port': port, - 'user_reader': user_reader, - 'user_reader_password': user_reader_password, - 'root_password': root_password + "name": name, + "user": user, + "password": password, + "server": server, + "port": port, + "user_reader": user_reader, + "user_reader_password": user_reader_password, + "root_password": root_password, } for param, value in odb_class_attribs.items(): write_to_config(DEFAULT_ODB, param, value) if kegg_species: - write_to_config('KEGG', 'species', kegg_species) + write_to_config("KEGG", "species", kegg_species) if sqlalchemy_connection_string: - write_to_config('DATABASE', 'sqlalchemy_connection_string', sqlalchemy_connection_string) + write_to_config("DATABASE", "sqlalchemy_connection_string", sqlalchemy_connection_string) if snp_related_traits: - write_to_config('SNP_RELATED_TRAITS', 'keyword', snp_related_traits) + write_to_config("SNP_RELATED_TRAITS", "keyword", snp_related_traits) if drugbank_user: - write_to_config('DRUGBANK', 'user', drugbank_user) + write_to_config("DRUGBANK", "user", drugbank_user) if drugbank_password: - write_to_config('DRUGBANK', 'password', drugbank_password) + write_to_config("DRUGBANK", "password", drugbank_password) return get_config_as_dict() @@ -81,16 +84,16 @@ def write_to_config(section: str, option: str, value: str) -> None: config = RawConfigParser() if not os.path.exists(cfp): - with open(cfp, 'w') as config_file: + with open(cfp, "w") as config_file: config[section] = {option: value} config.write(config_file) - logging.info(f'Set in configuration file {cfp} in section {section} {option}={value}') + logging.info(f"Set in configuration file {cfp} in section {section} {option}={value}") else: config.read(cfp) if not config.has_section(section): config.add_section(section) config.set(section, option, value) - with open(cfp, 'w') as configfile: + with open(cfp, "w") as configfile: 
config.write(configfile) @@ -113,47 +116,86 @@ def user_config_setup(config_exists: bool = True) -> dict: print(f"\n{TF.TITLE}e(BE:L) Configuration WIZARD{TF.RESET}") print("\nThe following questionnaire will guide you through the configuration process.\n") - print("Before we start: Make sure\n\t1. OrientDB and\n\t2. MySQL/MariaDB (optional)\n are running and " - "you have the root password for both, if databases and users do not already exist.\n") - print(f"Default {TF.DEFAULT_VALUE}[values]{TF.RESET} are written in square brackets and " - f"can be confirmed by RETURN.\n") + print( + "Before we start: Make sure\n\t1. OrientDB and\n\t2. MySQL/MariaDB (optional)\n are running and " + "you have the root password for both, if databases and users do not already exist.\n" + ) + print( + f"Default {TF.DEFAULT_VALUE}[values]{TF.RESET} are written in square brackets and " + f"can be confirmed by RETURN.\n" + ) print(f"{TF.Format.UNDERLINED}Installation references{TF.RESET}") print(f"\t OrientDB: {TF.Fore.BLUE}https://orientdb.org/docs/3.1.x/fiveminute/java.html{TF.RESET}") - print(f"\t MySQL: {TF.Fore.BLUE}" - f"https://dev.mysql.com/doc/mysql-getting-started/en/#mysql-getting-started-installing{TF.RESET}") + print( + f"\t MySQL: {TF.Fore.BLUE}" + f"https://dev.mysql.com/doc/mysql-getting-started/en/#mysql-getting-started-installing{TF.RESET}" + ) print(f"\t MariaDB: {TF.Fore.BLUE}https://mariadb.com/kb/en/getting-installing-and-upgrading-mariadb/{TF.RESET}\n") # RDBMS setup - old_sa_con_str = old_configs.get('sqlalchemy_connection_string', '').strip() - configs['sqlalchemy_connection_string'] = __user_rdbms_setup(old_sa_con_str) + old_sa_con_str = old_configs.get("sqlalchemy_connection_string", "").strip() + configs["sqlalchemy_connection_string"] = __user_rdbms_setup(old_sa_con_str) # OrientDB Setup print(f"\n{TF.HEADER}Graph database (OrientDB) settings{TF.RESET}") - Configuration = namedtuple('Configuration', ['name', - 'question', - 'default', - 'validation_regex', - 
'is_password', - 'required']) + Configuration = namedtuple( + "Configuration", + ["name", "question", "default", "validation_regex", "is_password", "required"], + ) old_configs_odb = old_configs.get(DEFAULT_ODB, {}) orientdb_configs = { - 'name': ("OrientDB database name (created if not exists)", - old_configs_odb.get('name', 'ebel'), r'^[A-Za-z]\w{2,}$', False, True), - 'user': ("OrientDB user (admin) name (created if not exists)", - old_configs_odb.get('user', 'ebel_user'), r'^[A-Za-z]\w{2,}$', False, True), - 'password': ("OrientDB user (admin) password (created if not exists)", - ''.join(random.sample(string.ascii_letters, 12)), None, True, True), - 'server': ("OrientDB server", - old_configs_odb.get('server', 'localhost'), None, False, True), - 'port': ("OrientDB port", - old_configs_odb.get('port', '2424'), r'^\d+$', False, True), - 'user_reader': ("OrientDB user (reader) name (created if not exists).", - old_configs_odb.get('user_reader', 'ebel_reader'), r'^[A-Za-z]\w{2,}$', False, False), - 'user_reader_password': ("OrientDB user (reader) password (created if not exists).", - ''.join(random.sample(string.ascii_letters, 12)), None, True, True), + "name": ( + "OrientDB database name (created if not exists)", + old_configs_odb.get("name", "ebel"), + r"^[A-Za-z]\w{2,}$", + False, + True, + ), + "user": ( + "OrientDB user (admin) name (created if not exists)", + old_configs_odb.get("user", "ebel_user"), + r"^[A-Za-z]\w{2,}$", + False, + True, + ), + "password": ( + "OrientDB user (admin) password (created if not exists)", + "".join(random.sample(string.ascii_letters, 12)), + None, + True, + True, + ), + "server": ( + "OrientDB server", + old_configs_odb.get("server", "localhost"), + None, + False, + True, + ), + "port": ( + "OrientDB port", + old_configs_odb.get("port", "2424"), + r"^\d+$", + False, + True, + ), + "user_reader": ( + "OrientDB user (reader) name (created if not exists).", + old_configs_odb.get("user_reader", "ebel_reader"), + r"^[A-Za-z]\w{2,}$", 
+ False, + False, + ), + "user_reader_password": ( + "OrientDB user (reader) password (created if not exists).", + "".join(random.sample(string.ascii_letters, 12)), + None, + True, + True, + ), } for param_name, options in orientdb_configs.items(): @@ -179,20 +221,24 @@ def user_config_setup(config_exists: bool = True) -> dict: # KEGG print(f"\n{TF.HEADER}KEGG settings{TF.RESET}") - kegg_question = f"{TF.QUESTION}KEGG species as 3-4 letter code comma separated.{TF.RESET} " \ - "(see here for the KEGG organism letter codes: " \ - f"{TF.Fore.BLUE}https://www.genome.jp/kegg/catalog/org_list4.html{TF.RESET} ) " \ - f"{TF.DEFAULT_VALUE}[hsa,rno,mmu]: {TF.RESET}" - configs['kegg_species'] = input(kegg_question).strip() or 'hsa,rno,mmu' + kegg_question = ( + f"{TF.QUESTION}KEGG species as 3-4 letter code comma separated.{TF.RESET} " + "(see here for the KEGG organism letter codes: " + f"{TF.Fore.BLUE}https://www.genome.jp/kegg/catalog/org_list4.html{TF.RESET} ) " + f"{TF.DEFAULT_VALUE}[hsa,rno,mmu]: {TF.RESET}" + ) + configs["kegg_species"] = input(kegg_question).strip() or "hsa,rno,mmu" print(f"\n{TF.HEADER}SNP related traits settings{TF.RESET}") # SNP - default_snp_related_traits = old_configs.get('snp_related_traits') or 'Alzheimer,Parkinson' - snp_related_traits_question = f"{TF.QUESTION}SNPs related to (separated by comma){TF.RESET} " \ - f"{TF.DEFAULT_VALUE}[{default_snp_related_traits}]: {TF.RESET}" + default_snp_related_traits = old_configs.get("snp_related_traits") or "Alzheimer,Parkinson" + snp_related_traits_question = ( + f"{TF.QUESTION}SNPs related to (separated by comma){TF.RESET} " + f"{TF.DEFAULT_VALUE}[{default_snp_related_traits}]: {TF.RESET}" + ) snp_related_traits = input(snp_related_traits_question).strip() - configs['snp_related_traits'] = snp_related_traits or default_snp_related_traits + configs["snp_related_traits"] = snp_related_traits or default_snp_related_traits # DrugBank print(f"\n{TF.HEADER}DrugBank settings{TF.RESET}") @@ -200,8 +246,8 @@ 
def user_config_setup(config_exists: bool = True) -> dict: drugbank_user = input(f"{TF.QUESTION}DrugBank user: {TF.RESET}").strip() if drugbank_user: - configs['drugbank_user'] = drugbank_user - configs['drugbank_password'] = getpass(f"{TF.QUESTION}DrugBank password: {TF.RESET}") + configs["drugbank_user"] = drugbank_user + configs["drugbank_password"] = getpass(f"{TF.QUESTION}DrugBank password: {TF.RESET}") current_config = set_configuration(**configs) @@ -215,14 +261,19 @@ def __user_orientdb_setup(): def __user_rdbms_setup(prev_conn: str) -> str: """The initial setup process for the RDBMS.""" - db_choice = input("""e(BE:L) requires some basic information in order to begin importing data and building a + db_choice = ( + input( + """e(BE:L) requires some basic information in order to begin importing data and building a Knowledge Graph. The nodes and edges compiled from BEL statements are imported into OrientDB while the information parsed from external repositories is stored in a more traditional relational database and uses either SQLite or MySQL. While SQLite is easier to set up and does not require installing additional software, MySQL is the recommended option due to the amount of information that will be imported. 
-MySQL/SQLite [MySQL]: """) or "mysql" +MySQL/SQLite [MySQL]: """ + ) + or "mysql" + ) while db_choice.lower() not in ("sqlite", "mysql"): db_choice = input("Bad input, please enter either 'MySQL' or 'SQLite': ") @@ -245,35 +296,43 @@ def __mysql_setup(old_sa_con_str: str) -> str: old_mysql = {} if old_sa_con_str: - regex_con_str = r"^mysql\+pymysql://(?P.*?):" \ - r"(?P.*?)@(?P.*?)/(?P.*)$" + regex_con_str = ( + r"^mysql\+pymysql://(?P.*?):" r"(?P.*?)@(?P.*?)/(?P.*)$" + ) found_old_mysql = re.search(regex_con_str, old_sa_con_str) if found_old_mysql: old_mysql = found_old_mysql.groupdict() - default_mysql_host = old_mysql.get('mysql_host') or 'localhost' - mysql_host_question = f"{TF.QUESTION}MySQL/MariaDB server name{TF.RESET} " \ - f"{TF.DEFAULT_VALUE}[{default_mysql_host}]: {TF.RESET}" + default_mysql_host = old_mysql.get("mysql_host") or "localhost" + mysql_host_question = ( + f"{TF.QUESTION}MySQL/MariaDB server name{TF.RESET} " f"{TF.DEFAULT_VALUE}[{default_mysql_host}]: {TF.RESET}" + ) mysql_host = input(mysql_host_question) or default_mysql_host - default_mysql_port = old_mysql.get('mysql_db') or '3306' - mysql_port_question = f"{TF.QUESTION}MySQL/MariaDB port{TF.RESET} " \ - f"{TF.DEFAULT_VALUE}[{default_mysql_port}]: {TF.RESET}" + default_mysql_port = old_mysql.get("mysql_db") or "3306" + mysql_port_question = ( + f"{TF.QUESTION}MySQL/MariaDB port{TF.RESET} " f"{TF.DEFAULT_VALUE}[{default_mysql_port}]: {TF.RESET}" + ) mysql_port = input(mysql_port_question).strip() or default_mysql_port - default_mysql_user = old_mysql.get('mysql_user') or 'ebel' - mysql_user_question = f"{TF.QUESTION}MySQL/MariaDB (non-root) user{TF.RESET} " \ - f"{TF.DEFAULT_VALUE}[{default_mysql_user}]: {TF.RESET}" + default_mysql_user = old_mysql.get("mysql_user") or "ebel" + mysql_user_question = ( + f"{TF.QUESTION}MySQL/MariaDB (non-root) user{TF.RESET} " + f"{TF.DEFAULT_VALUE}[{default_mysql_user}]: {TF.RESET}" + ) mysql_user = input(mysql_user_question).strip() or 
default_mysql_user - mysql_random_password = ''.join(random.sample(string.ascii_letters, 12)) - mysql_passed_question = f"{TF.QUESTION}MySQL/MariaDB password for user{TF.RESET} " \ - f"{TF.DEFAULT_VALUE}[{mysql_random_password}]: {TF.RESET}" + mysql_random_password = "".join(random.sample(string.ascii_letters, 12)) + mysql_passed_question = ( + f"{TF.QUESTION}MySQL/MariaDB password for user{TF.RESET} " + f"{TF.DEFAULT_VALUE}[{mysql_random_password}]: {TF.RESET}" + ) mysql_pwd = getpass(mysql_passed_question).strip() or mysql_random_password - default_mysql_db = old_mysql.get('mysql_db') or 'ebel' - mysql_db_question = f"{TF.QUESTION}MySQL/MariaDB database name{TF.RESET} " \ - f"{TF.DEFAULT_VALUE}[{default_mysql_db}]: {TF.RESET}" + default_mysql_db = old_mysql.get("mysql_db") or "ebel" + mysql_db_question = ( + f"{TF.QUESTION}MySQL/MariaDB database name{TF.RESET} " f"{TF.DEFAULT_VALUE}[{default_mysql_db}]: {TF.RESET}" + ) mysql_db = input(mysql_db_question).strip() or default_mysql_db db_conn = f"mysql+pymysql://{mysql_user}:{quote(mysql_pwd)}@{mysql_host}:{mysql_port}/{mysql_db}?charset=utf8mb4" @@ -282,10 +341,11 @@ def __mysql_setup(old_sa_con_str: str) -> str: pymysql.connect(host=mysql_host, user=mysql_user, password=mysql_pwd, db=mysql_db) except pymysql.err.OperationalError: - mysql_root_passwd = getpass(f"{TF.QUESTION}MySQL root password (will be not stored) " - f"to create database and user: {TF.RESET}") - print(mysql_host, 'root', mysql_root_passwd) - cursor = pymysql.connect(host=mysql_host, user='root', password=mysql_root_passwd).cursor() + mysql_root_passwd = getpass( + f"{TF.QUESTION}MySQL root password (will be not stored) " f"to create database and user: {TF.RESET}" + ) + print(mysql_host, "root", mysql_root_passwd) + cursor = pymysql.connect(host=mysql_host, user="root", password=mysql_root_passwd).cursor() db_exists = cursor.execute("show databases like %s", mysql_db) if not db_exists: cursor.execute(f"CREATE DATABASE {mysql_db} CHARACTER SET 
utf8mb4 COLLATE utf8mb4_general_ci") diff --git a/ebel/constants.py b/ebel/constants.py index 38edeec..4d8c28b 100755 --- a/ebel/constants.py +++ b/ebel/constants.py @@ -4,9 +4,9 @@ import os THIS_DIR = os.path.dirname(__file__) -PROJECT_NAME = 'ebel' +PROJECT_NAME = "ebel" -HOME = os.path.expanduser('~') +HOME = os.path.expanduser("~") LIBRARY_NAME = PROJECT_NAME # Path to folder @@ -15,30 +15,31 @@ os.mkdir(PROJECT_DIR) # Path to data folder -DATA_DIR = os.path.join(PROJECT_DIR, 'data') +DATA_DIR = os.path.join(PROJECT_DIR, "data") if not os.path.exists(DATA_DIR): os.mkdir(DATA_DIR) # Path to logs folder -LOG_DIR = os.path.join(PROJECT_DIR, 'logs') +LOG_DIR = os.path.join(PROJECT_DIR, "logs") if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) # Default database name and location -DB_NAME = '{}.db'.format(PROJECT_NAME) +DB_NAME = "{}.db".format(PROJECT_NAME) DB_PATH = os.path.join(DATA_DIR, DB_NAME) -GRAMMAR_BEL_PATH = {'2': os.path.join(THIS_DIR, 'grammar', 'grammar_bel_2.bnf'), - '2_1': os.path.join(THIS_DIR, 'grammar', 'grammar_bel_2_1.bnf'), - } +GRAMMAR_BEL_PATH = { + "2": os.path.join(THIS_DIR, "grammar", "grammar_bel_2.bnf"), + "2_1": os.path.join(THIS_DIR, "grammar", "grammar_bel_2_1.bnf"), +} -GRAMMAR_NS_ANNO_PATH = os.path.join(THIS_DIR, 'grammar', 'grammar_belns_belanno_1__2.bnf') -GRAMMAR_START_NS = 'belns' -GRAMMAR_START_ANNO = 'belanno' -GRAMMAR_START_LINE = 'script_line_by_line' +GRAMMAR_NS_ANNO_PATH = os.path.join(THIS_DIR, "grammar", "grammar_belns_belanno_1__2.bnf") +GRAMMAR_START_NS = "belns" +GRAMMAR_START_ANNO = "belanno" +GRAMMAR_START_LINE = "script_line_by_line" # Variables -RID = 'rid' +RID = "rid" URL = "URL" PATTERN = "PATTERN" @@ -50,37 +51,37 @@ # Protein Modifications PMOD = { - 'pmod_ace': "Ac", - 'pmod_adr': "ADPRib", - 'pmod_add': "ADP-rybosylation", - 'pmod_far': "Farn", - 'pmod_ger': "Gerger", - 'pmod_gly': "Glyco", - 'pmod_hyd': "Hy", - 'pmod_isg': "ISG", - 'pmod_me0': "Me", - 'pmod_me1': "Me1", - 'pmod_mon': "monomethylation", - 
'pmod_me2': "Me2", - 'pmod_me3': "Me3", - 'pmod_tri': "trimethylation", - 'pmod_myr': "Myr", - 'pmod_ned': "Nedd", - 'pmod_ngl': "NGlyco", - 'pmod_nit': "NO", - 'pmod_ogl': "OGlyco", - 'pmod_pal': "Palm", - 'pmod_pho': "Ph", - 'pmod_sul': "Sulf", - 'pmod_sup': "sulphation", - 'pmod_suh': "sulfonation", - 'pmod_sum': "Sumo", - 'pmod_suy': "Ub", - 'pmod_ubi': "ubiquitinylation", - 'pmod_u48': "UbK48", - 'pmod_u63': "UbK63", - 'pmod_ubm': "UbMono", - 'pmod_ubp': "UbPoly" + "pmod_ace": "Ac", + "pmod_adr": "ADPRib", + "pmod_add": "ADP-rybosylation", + "pmod_far": "Farn", + "pmod_ger": "Gerger", + "pmod_gly": "Glyco", + "pmod_hyd": "Hy", + "pmod_isg": "ISG", + "pmod_me0": "Me", + "pmod_me1": "Me1", + "pmod_mon": "monomethylation", + "pmod_me2": "Me2", + "pmod_me3": "Me3", + "pmod_tri": "trimethylation", + "pmod_myr": "Myr", + "pmod_ned": "Nedd", + "pmod_ngl": "NGlyco", + "pmod_nit": "NO", + "pmod_ogl": "OGlyco", + "pmod_pal": "Palm", + "pmod_pho": "Ph", + "pmod_sul": "Sulf", + "pmod_sup": "sulphation", + "pmod_suh": "sulfonation", + "pmod_sum": "Sumo", + "pmod_suy": "Ub", + "pmod_ubi": "ubiquitinylation", + "pmod_u48": "UbK48", + "pmod_u63": "UbK63", + "pmod_ubm": "UbMono", + "pmod_ubp": "UbPoly", } CYTOSCAPE_TEMPLATE = """{ @@ -112,17 +113,17 @@ # Edge types # Causal -INCREASES = 'increases' -DIRECTLY_INCREASES = 'directly_increases' +INCREASES = "increases" +DIRECTLY_INCREASES = "directly_increases" CAUSAL_INCREASE = {INCREASES, DIRECTLY_INCREASES} -DECREASES = 'decreases' -DIRECTLY_DECREASES = 'directly_decreases' +DECREASES = "decreases" +DIRECTLY_DECREASES = "directly_decreases" CAUSAL_DECREASE = {DECREASES, DIRECTLY_DECREASES} # Correlative -POSITIVE_CORRELATION = 'positive_correlation' -NEGATIVE_CORRELATION = 'negative_correlation' +POSITIVE_CORRELATION = "positive_correlation" +NEGATIVE_CORRELATION = "negative_correlation" # Terminal Colors diff --git a/ebel/database.py b/ebel/database.py index de0dde5..7a006ec 100644 --- a/ebel/database.py +++ b/ebel/database.py 
@@ -1,17 +1,18 @@ """Methods for interfacing with the RDBMS.""" +import getpass import logging import sys -import getpass from typing import Optional, Union import pymysql from pyorientdb import OrientDB -from pyorientdb.exceptions import PyOrientCommandException, PyOrientConnectionException, \ - PyOrientSecurityAccessException +from pyorientdb.exceptions import (PyOrientCommandException, + PyOrientConnectionException, + PyOrientSecurityAccessException) -from ebel.defaults import CONN_STR_DEFAULT -from ebel.config import write_to_config +from ebel.config import get_config_as_dict, write_to_config from ebel.constants import TerminalFormatting as TF +from ebel.defaults import CONN_STR_DEFAULT logger = logging.getLogger(__name__) @@ -31,11 +32,19 @@ def orientdb_connection_works(server: str, port: int, name: str, user: str, pass return works -def get_orientdb_client(server: str, port: int, name: str, user: str, password: str, - root_password: Optional[str] = None, user_reader: Optional[str] = None, - user_reader_password: Optional[str] = None) -> OrientDB: +def get_orientdb_client( + server: str, + port: int, + name: str, + user: str, + password: str, + root_password: Optional[str] = None, + user_reader: Optional[str] = None, + user_reader_password: Optional[str] = None, +) -> OrientDB: """Attempts to connect to the OrientDB client. 
This is currently done by using session tokens.""" # TODO PyOrientStorageException occurs with newer ODB versions + server = get_config_as_dict()["DEFAULT_ODB"]["server"] client = OrientDB(server, port) # First try connect as admin user if this fails connect root_user from config, @@ -44,83 +53,93 @@ def get_orientdb_client(server: str, port: int, name: str, user: str, password: client.set_session_token(True) client.db_open(name, user, password) - except (PyOrientCommandException, PyOrientConnectionException, PyOrientSecurityAccessException): + except ( + PyOrientCommandException, + PyOrientConnectionException, + PyOrientSecurityAccessException, + ): client.set_session_token(True) root_passwd_correct = False while not root_passwd_correct: odb_root_question = f"{TF.QUESTION}OrientDB root password (to create database and users): {TF.RESET}" root_password = root_password or getpass.getpass(odb_root_question) try: - client.connect('root', root_password) + client.connect("root", root_password) root_passwd_correct = True except (PyOrientConnectionException, PyOrientSecurityAccessException): - logger.error(f'Connection problem to OrientDB server {server}:{port}') - print(f"Please make sure the OrientDB server is running, port ({port}), " - f"as well server ({server}) and root password are correct") + logger.error(f"Connection problem to OrientDB server {server}:{port}") + print( + f"Please make sure the OrientDB server is running, port ({port}), " + f"as well server ({server}) and root password are correct" + ) sys.exit() if not client.db_exists(name): client.db_create(name) logger.info(f"Create database '{name}'") - client.db_open(name, 'root', root_password) + client.db_open(name, "root", root_password) # create user with admin rights - client.command( - f"CREATE USER {user} IDENTIFIED BY {password} ROLE admin") + client.command(f"CREATE USER {user} IDENTIFIED BY {password} ROLE admin") # create a reader if user_reader and user_reader_password: - client.command( - 
f"CREATE USER {user_reader} IDENTIFIED BY {user_reader_password} ROLE reader") + client.command(f"CREATE USER {user_reader} IDENTIFIED BY {user_reader_password} ROLE reader") client.close() # reopen with new user and password client = OrientDB(server, int(port)) client.set_session_token(True) client.db_open(name, user, password) else: - client.db_open(name, 'root', root_password) + client.db_open(name, "root", root_password) # admin - admin_user_exists_sql = "Select true as admin_user_exists from OUser " \ - f"where name = '{user}' and status='ACTIVE' and 'admin' in roles.name" + admin_user_exists_sql = ( + "Select true as admin_user_exists from OUser " + f"where name = '{user}' and status='ACTIVE' and 'admin' in roles.name" + ) admin_user_exists = client.command(admin_user_exists_sql) if admin_user_exists: print("Update password for OrientDB admin") - client.command( - f"UPDATE OUser SET password = '{password}' WHERE name = '{user}'") + client.command(f"UPDATE OUser SET password = '{password}' WHERE name = '{user}'") else: print("Create password for OrientDB reader") - client.command( - f"CREATE USER {user} IDENTIFIED BY {password} ROLE admin") + client.command(f"CREATE USER {user} IDENTIFIED BY {password} ROLE admin") # reader if user_reader and user_reader_password: - reader_user_exists_sql = "Select true as admin_user_exists from OUser " \ - f"where name = '{user_reader}' and status='ACTIVE' and 'reader' in roles.name" + reader_user_exists_sql = ( + "Select true as admin_user_exists from OUser " + f"where name = '{user_reader}' and status='ACTIVE' and 'reader' in roles.name" + ) reader_user_exists = client.command(reader_user_exists_sql) if reader_user_exists: client.command( - f"UPDATE OUser SET password = '{user_reader_password}' WHERE name = '{user_reader}'") + f"UPDATE OUser SET password = '{user_reader_password}' WHERE name = '{user_reader}'" + ) else: - client.command( - f"CREATE USER {user_reader} IDENTIFIED BY {user_reader_password} ROLE admin") + 
client.command(f"CREATE USER {user_reader} IDENTIFIED BY {user_reader_password} ROLE admin") return client def set_mysql_interactive() -> tuple: """Interactive mode to setup MySQL database.""" - print("Interactive mode\n \ + print( + "Interactive mode\n \ ================\n \ - 1st setup of db and user with root:\n") - host = input("host[localhost]:") or 'localhost' - user = input("ebel user[ebel]:") or 'ebel' - password = getpass.getpass(prompt="ebel password[ebel]:") or 'ebel' - db = input("database name[ebel]") or 'ebel' - print("If you want to setup the database automatically,\n \ - then type in the root password, otherwise nothing") - root_pwd = getpass.getpass(prompt='root password (only for 1st setup):') + 1st setup of db and user with root:\n" + ) + host = input("host[localhost]:") or "localhost" + user = input("ebel user[ebel]:") or "ebel" + password = getpass.getpass(prompt="ebel password[ebel]:") or "ebel" + db = input("database name[ebel]") or "ebel" + print( + "If you want to setup the database automatically,\n \ + then type in the root password, otherwise nothing" + ) + root_pwd = getpass.getpass(prompt="root password (only for 1st setup):") if root_pwd: - root_host = getpass.getpass(prompt='IP or name mysql server [localhost]:') or 'localhost' - conn = pymysql.connect(host=root_host, user='root', password=root_pwd) + root_host = getpass.getpass(prompt="IP or name mysql server [localhost]:") or "localhost" + conn = pymysql.connect(host=root_host, user="root", password=root_pwd) c = conn.cursor() db_exists = c.execute(f"show databases like '{db}'") @@ -152,12 +171,14 @@ def set_mysql_interactive() -> tuple: return host, user, password, db -def set_mysql_connection(host: str = 'localhost', - user: str = 'ebel_user', - password: str = 'ebel_passwd', - db: str = 'ebel', - port: Union[str, int] = '3306', - charset: str = 'utf8mb4'): +def set_mysql_connection( + host: str = "localhost", + user: str = "ebel_user", + password: str = "ebel_passwd", + db: str = 
"ebel", + port: Union[str, int] = "3306", + charset: str = "utf8mb4", +): """Set the connection using MySQL Parameters. Parameters @@ -181,7 +202,7 @@ def set_mysql_connection(host: str = 'localhost', SQLAlchemy MySQL connection string. """ - connection_string = f'mysql+pymysql://{user}:{password}@{host}/{db}:{port}?charset={charset}' + connection_string = f"mysql+pymysql://{user}:{password}@{host}/{db}:{port}?charset={charset}" set_connection(connection_string) return connection_string @@ -196,7 +217,7 @@ def set_always_create_new_db(always_create_new_db: bool = True) -> None: Option `always_create_new_db` in section `database` in config file. """ - write_to_config('database', 'always_create_new', str(always_create_new_db)) + write_to_config("database", "always_create_new", str(always_create_new_db)) def set_connection(connection: str = CONN_STR_DEFAULT) -> None: @@ -208,4 +229,4 @@ def set_connection(connection: str = CONN_STR_DEFAULT) -> None: SQLAlchemy connection string. """ - write_to_config('DATABASE', 'sqlalchemy_connection_string', connection) + write_to_config("DATABASE", "sqlalchemy_connection_string", connection) diff --git a/ebel/defaults.py b/ebel/defaults.py index b312259..566da81 100755 --- a/ebel/defaults.py +++ b/ebel/defaults.py @@ -2,11 +2,11 @@ """This file contains default values for configurations and parameters.""" -import os import logging import logging.handlers as handlers +import os -from .constants import PROJECT_DIR, DATA_DIR, LOG_DIR +from .constants import DATA_DIR, LOG_DIR, PROJECT_DIR ############################################################################### # UNIPROT taxonomy IDs to import @@ -16,51 +16,48 @@ 10116, # Rats 10090, # Mice 7227, # Drosophila melanogaster - 694009, 2697049, # COVID + 694009, + 2697049, # COVID 7955, # Zebrafish ] ############################################################################### -SQLITE_DATABASE_NAME = 'ebel.db' -SQLITE_TEST_DATABASE_NAME = 'ebel_test.db' -DATABASE_LOCATION = 
os.path.join( - DATA_DIR, - SQLITE_DATABASE_NAME -) -DEFAULT_TEST_DATABASE_LOCATION = os.path.join( - DATA_DIR, - SQLITE_TEST_DATABASE_NAME -) +SQLITE_DATABASE_NAME = "ebel.db" +SQLITE_TEST_DATABASE_NAME = "ebel_test.db" +DATABASE_LOCATION = os.path.join(DATA_DIR, SQLITE_DATABASE_NAME) +DEFAULT_TEST_DATABASE_LOCATION = os.path.join(DATA_DIR, SQLITE_TEST_DATABASE_NAME) ############################################################################### # SQLAlchemy connection strings # ============================= # SQLite # ------ -CONN_STR_DEFAULT = 'sqlite:///' + DATABASE_LOCATION -CONN_STR_TESTS = 'sqlite:///' + SQLITE_TEST_DATABASE_NAME +CONN_STR_DEFAULT = "sqlite:///" + DATABASE_LOCATION +CONN_STR_TESTS = "sqlite:///" + SQLITE_TEST_DATABASE_NAME # MySQL # ----- -CONN_STR_MYSQL_PREFIX = 'mysql+pymysql://ebel:ebel@localhost/' -CONN_STR_MYSQL = CONN_STR_MYSQL_PREFIX + 'ebel?charset=utf8' -CONN_STR_MYSQL_TESTS = CONN_STR_MYSQL_PREFIX + 'ebel_test?charset=utf8' +CONN_STR_MYSQL_PREFIX = "mysql+pymysql://ebel:ebel@localhost/" +CONN_STR_MYSQL = CONN_STR_MYSQL_PREFIX + "ebel?charset=utf8" +CONN_STR_MYSQL_TESTS = CONN_STR_MYSQL_PREFIX + "ebel_test?charset=utf8" ############################################################################### # Config -config_file_path = os.path.join(PROJECT_DIR, 'config.ini') +config_file_path = os.path.join(PROJECT_DIR, "config.ini") ############################################################################### # Log Handling -logHandler = handlers.RotatingFileHandler(filename=os.path.join(LOG_DIR, 'ebel.log'), - mode='a', - maxBytes=4098 * 10, # 4MB file max - backupCount=0) -logh_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logHandler = handlers.RotatingFileHandler( + filename=os.path.join(LOG_DIR, "ebel.log"), + mode="a", + maxBytes=4098 * 10, # 4MB file max + backupCount=0, +) +logh_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 
logHandler.setFormatter(logh_format) logHandler.setLevel(logging.DEBUG) # Console Handler ch = logging.StreamHandler() -ch_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') +ch_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") ch.setFormatter(ch_format) ch.setLevel(logging.WARNING) diff --git a/ebel/errors.py b/ebel/errors.py index f3f4468..1fb52ca 100755 --- a/ebel/errors.py +++ b/ebel/errors.py @@ -1,12 +1,13 @@ """Error class definitions.""" -from lark.exceptions import UnexpectedToken, UnexpectedCharacters - -from collections import OrderedDict import re import sys +from collections import OrderedDict -TEMPLATE = '{error_class}\tkeyword:{keyword}\tentry:{entry}\tline:{line_number}\tcolumn:{column}' \ - '\turl:{url}\thint:{hint}' +from lark.exceptions import UnexpectedCharacters, UnexpectedToken + +TEMPLATE = ( + "{error_class}\tkeyword:{keyword}\tentry:{entry}\tline:{line_number}\tcolumn:{column}" "\turl:{url}\thint:{hint}" +) class _Error: @@ -14,19 +15,21 @@ class _Error: def __init__(self): self.class_name = self.__class__.__name__ - self.value_dict = OrderedDict([ - ("error_class", self.class_name), - ("url", None), - ("keyword", None), - ("entry", None), - ("line_number", None), - ("column", None), - ("hint", None) - ]) + self.value_dict = OrderedDict( + [ + ("error_class", self.class_name), + ("url", None), + ("keyword", None), + ("entry", None), + ("line_number", None), + ("column", None), + ("hint", None), + ] + ) def to_dict(self): """Format the properties of error into a dictionary.""" - raise NotImplementedError('to_dict have to be implemented in {}'.format(self.__class__.__name__)) + raise NotImplementedError("to_dict have to be implemented in {}".format(self.__class__.__name__)) def to_string(self) -> str: """Format the output to a string.""" @@ -62,7 +65,7 @@ def to_dict(self) -> dict: "keyword": self.ns_keyword, "entry": re.sub("[\n\r]", "", self.entry), "line_number": self.line_number, - "column": 
self.column + "column": self.column, } @@ -91,7 +94,7 @@ def to_dict(self) -> dict: "keyword": self.ns_keyword, "entry": re.sub("[\n\r]", "", self.entry), "line_number": self.line_number, - "column": self.column + "column": self.column, } @@ -120,7 +123,7 @@ def to_dict(self) -> dict: "keyword": self.keyword, "entry": re.sub("[\n\r]", "", self.entry), "line_number": self.line_number, - "column": self.column + "column": self.column, } @@ -149,14 +152,22 @@ def to_dict(self) -> dict: "keyword": self.keyword, "entry": re.sub("[\n\r]", "", self.entry), "line_number": self.line_number, - "column": self.column + "column": self.column, } class NotInNamespaceUrl(_Error): """Error in entry links to a namespace defined in the header but does not exist in namespace url.""" - def __init__(self, keyword: str, url_or_path: str, entry: str, line_number: int, column: int, hint: str): + def __init__( + self, + keyword: str, + url_or_path: str, + entry: str, + line_number: int, + column: int, + hint: str, + ): """Initialize error class. :param keyword: Error class type. @@ -184,14 +195,22 @@ def to_dict(self) -> dict: "entry": re.sub("[\n\r]", "", self.entry), "line_number": self.line_number, "column": self.column, - "hint": self.hint + "hint": self.hint, } class NotInAnnotationUrl(_Error): """Error in entry links to an annotation defined in the header but not exists in namespace url.""" - def __init__(self, keyword: str, url_or_path: str, entry: str, line_number: int, column: int, hint: str): + def __init__( + self, + keyword: str, + url_or_path: str, + entry: str, + line_number: int, + column: int, + hint: str, + ): """Initialize error class. :param keyword: Error class type. 
@@ -219,7 +238,7 @@ def to_dict(self) -> dict: "entry": re.sub("[\n\r]", "", self.entry), "line_number": self.line_number, "column": self.column, - "hint": self.hint + "hint": self.hint, } @@ -299,32 +318,42 @@ def __init__(self, exception, line_number: int, line: str): def to_dict(self) -> dict: """Format the properties of error into a dictionary.""" - hint = ("%s >>>>>> %s" % (self.line[:self.exception.column - 1], - self.line[self.exception.column - 1:])).strip() + hint = ( + "%s >>>>>> %s" + % ( + self.line[: self.exception.column - 1], + self.line[self.exception.column - 1 :], + ) + ).strip() value_dict = { "line_number": self.line_number, - "hint": re.sub("[\n\r]", "", hint) + "hint": re.sub("[\n\r]", "", hint), } if isinstance(self.exception, UnexpectedCharacters): - offset = self.exception.column - value_dict.update({ - "error_class": self.__class__.__name__ + '_unexpected_input', - "entry": self.line[offset:(offset + 5)], - "column": self.exception.column, - }) + value_dict.update( + { + "error_class": self.__class__.__name__ + "_unexpected_input", + "entry": self.line[offset : (offset + 5)], + "column": self.exception.column, + } + ) elif isinstance(self.exception, UnexpectedToken): - - value_dict.update({ - "error_class": self.__class__.__name__ + '_unexpected_token', - "entry": re.sub("[\n\r]", "", self.exception.token), - "column": self.exception.column, - }) + value_dict.update( + { + "error_class": self.__class__.__name__ + "_unexpected_token", + "entry": re.sub("[\n\r]", "", self.exception.token), + "column": self.exception.column, + } + ) else: - print("Not covered by library: lark.expections type {}".format(type(self.exception)), sys.exc_info()[0]) + print( + "Not covered by library: lark.expections type {}".format(type(self.exception)), + sys.exc_info()[0], + ) raise return value_dict @@ -333,7 +362,14 @@ def to_dict(self) -> dict: class NotDownloadedFromUrl(_Error): """Error in entry links to an annotation defined in the header but not exists in 
namespace url.""" - def __init__(self, keyword: str, url_or_path: str, hint: str, line: int = None, column: int = None): + def __init__( + self, + keyword: str, + url_or_path: str, + hint: str, + line: int = None, + column: int = None, + ): """Initialize error class. :param keyword: Error class type. diff --git a/ebel/grammar/grammar_bel_2_1.bnf b/ebel/grammar/grammar_bel_2_1.bnf index 52d6f55..4d48ea5 100755 --- a/ebel/grammar/grammar_bel_2_1.bnf +++ b/ebel/grammar/grammar_bel_2_1.bnf @@ -46,15 +46,15 @@ WORD_WITHOUT_ESCAPED_NEW_LINE: /([^\\\n\r](?!\\[\n\r]))+/ statement: ( subject | ( subject relation_basic object ) - | ( subject biomarker_relation _molec_process ) - | ( pat process_relation _molec_process ) - | ( gene orthologous_relation gene ) + | ( subject biomarker_relation molec_process ) + | ( pat process_relation molec_process ) + | ( ortho_subject orthologous_relation ortho_object ) | ( _basic_subobj analogous_relation _basic_subobj ) | ( transc_subject transc_relation transc_object ) | ( transl_subject transl_relation transl_object ) | ( subject relation_basic "(" nested_subject nested_relation nested_object ")" ) - | ( complex_abundance has_component object ) - | ( complex_abundance has_components list ) + | ( complex_obj has_component object ) + | ( complex_obj has_components list ) | ( subject has_members list )) [ statement_comment ] _EOL transc_relation: transcribed_to @@ -69,13 +69,17 @@ transc_subject: gene transc_object: rna transl_subject: rna transl_object: protein +ortho_subject: gene +ortho_object: gene bio_obj: protein | rna | gene +complex_obj: complex_abundance | complex_list + subject: _sub_obj object: _sub_obj -molecular_process: _molec_process +molec_process: _molec_process _molec_process: path | bp nested_subject: _sub_obj @@ -268,7 +272,6 @@ UNDERSCORE: "_" // ref: http://openbel.org/language/version_2.0/bel_specification_version_2.0.html#XcomplexA // TODO: add to complex_abundance location ! 
http://openbel.org/language/version_2.0/bel_specification_version_2.0.html#_location_loc -// complex_abundance: _complex_function "(" nn [ loc ] ")" should be the solution complex_abundance: _complex_function "(" nn [ loc ] ")" complex_list: _complex_function "(" list_complex ")" _complex_function: "complexAbundance" | "complex" diff --git a/ebel/manager/models.py b/ebel/manager/models.py index 13d3681..719860b 100755 --- a/ebel/manager/models.py +++ b/ebel/manager/models.py @@ -1,31 +1,28 @@ """This module contains the SQLAlchemy database models that support the definition cache and graph cache.""" +import codecs +import datetime +import logging import os import re import urllib -import codecs -import logging +from typing import List, Optional, Tuple, Union +from urllib.parse import quote_plus, urlencode + +import pandas as pd import requests -import datetime import sqlalchemy -import pandas as pd - -from tqdm import tqdm -from lark import Lark -from urllib.parse import urlencode, quote_plus -from typing import List, Tuple, Optional, Union - +from lark import Lark, Token, Tree +from sqlalchemy import Boolean, Column, ForeignKey, Index, Integer, String +from sqlalchemy.ext.declarative import declarative_base, declared_attr from sqlalchemy.orm import relationship from sqlalchemy.sql.expression import func -from sqlalchemy.ext.declarative import declarative_base, declared_attr -from sqlalchemy import Column, ForeignKey, Integer, String, Index, Boolean - from sqlalchemy_utils import create_database, database_exists +from tqdm import tqdm -from ebel import parser +from ebel.constants import (FILE, GRAMMAR_NS_ANNO_PATH, GRAMMAR_START_ANNO, + GRAMMAR_START_NS, URL) from ebel.tools import BelRdb -from ebel.constants import GRAMMAR_NS_ANNO_PATH, GRAMMAR_START_NS, GRAMMAR_START_ANNO, URL, FILE - Base = declarative_base() logger = logging.getLogger(__name__) @@ -57,7 +54,7 @@ def foreign_key_to(table_name): :return: foreign key column :rtype: sqlalchemy.Column """ - 
foreign_column = table_name + '.id' + foreign_column = table_name + ".id" return Column(Integer, ForeignKey(foreign_column)) @@ -72,18 +69,18 @@ def __tablename__(self): """Return name of class table.""" return self.__name__.lower() - __mapper_args__ = {'always_refresh': True} + __mapper_args__ = {"always_refresh": True} id = Column(Integer, primary_key=True) def _to_dict(self): """Protected method for converting values to dictionary.""" data_dict = self.__dict__.copy() - del data_dict['_sa_instance_state'] - del data_dict['id'] + del data_dict["_sa_instance_state"] + del data_dict["id"] for k, v in data_dict.items(): if isinstance(v, datetime.date): - data_dict[k] = data_dict[k].strftime('%Y-%m-%d') + data_dict[k] = data_dict[k].strftime("%Y-%m-%d") return data_dict def to_dict(self): @@ -114,8 +111,8 @@ class NamespaceEntry(Base, MasterModel): name = Column(String(2048), nullable=True) encoding = Column(String(8), nullable=True) - namespace__id = foreign_key_to('namespace') - namespace = relationship('Namespace', back_populates="entries") + namespace__id = foreign_key_to("namespace") + namespace = relationship("Namespace", back_populates="entries") class Annotation(Base, MasterModel): @@ -141,8 +138,8 @@ class AnnotationEntry(Base, MasterModel): name = Column(String(2048), nullable=True) identifier = Column(String(255), nullable=True) - annotation__id = foreign_key_to('annotation') - annotation = relationship('Annotation', back_populates="entries") + annotation__id = foreign_key_to("annotation") + annotation = relationship("Annotation", back_populates="entries") class ModelManager: @@ -163,8 +160,9 @@ def __init__(self, model, entries_model, grammar_start): self.errors = [] - def get_entries_not_exists(self, keyword: str, url: str, - entry_line_column_list) -> List[Tuple[str, int, int, str]]: + def get_entries_not_exists( + self, keyword: str, url: str, entry_line_column_list + ) -> List[Tuple[str, int, int, str]]: """Get entries in namespace or annotation linked 
to URL for entry_line_column_list(generator). Parameters @@ -189,50 +187,48 @@ def get_entries_not_exists(self, keyword: str, url: str, search_for = self.session.query(self.model.id).filter(self.model.keyword == keyword, self.model.url == url) - desc = 'Check BEL for {}: '.format(keyword) + desc = "Check BEL for {}: ".format(keyword) for entry, line, column in tqdm(list(entry_line_column_list), desc=desc, ncols=100): - if entry in not_exists_cache: hint = not_exists_cache[entry] names_not_exists.append((entry, line, column, hint)) elif (keyword, url, entry) not in exists_cache: - exists = search_for.join(self.entries_model).filter(self.entries_model.name == entry).count() if exists: exists_cache.update(set([(keyword, url, entry)])) else: hint = "" - alternatives = self.session.query( - self.entries_model.name, - self.model.keyword, - self.model.url - ).join(self.model).filter(self.entries_model.name.like(entry)).all() + alternatives = ( + self.session.query(self.entries_model.name, self.model.keyword, self.model.url) + .join(self.model) + .filter(self.entries_model.name.like(entry)) + .all() + ) if alternatives: hint = "Did you mean: " - hint += ", ".join([x[1] + ":\"" + x[0] + "\"(" + x[2] + ")" for x in alternatives]) + hint += ", ".join([x[1] + ':"' + x[0] + '"(' + x[2] + ")" for x in alternatives]) else: if len(entry) >= 6: - similars = self.session.query( - self.entries_model.name, - self.model.keyword - )\ - .join(self.model)\ - .filter(self.entries_model.name.like(entry[:-2] + "%"))\ - .filter(func.length(self.entries_model.name) < len(entry) + 3)\ - .limit(20)\ + similars = ( + self.session.query(self.entries_model.name, self.model.keyword) + .join(self.model) + .filter(self.entries_model.name.like(entry[:-2] + "%")) + .filter(func.length(self.entries_model.name) < len(entry) + 3) + .limit(20) .all() + ) if similars: hint = "Similar: " - hint += ", ".join([x[1] + ":\"" + x[0] + "\"" for x in set(similars)]) + hint += ", ".join([x[1] + ':"' + x[0] + '"' for 
x in set(similars)]) if not hint: - url_query_string = urlencode({'q': entry}, quote_via=quote_plus) + url_query_string = urlencode({"q": entry}, quote_via=quote_plus) hint = "[OLS suggests](https://www.ebi.ac.uk/ols/search?%s)" % url_query_string names_not_exists.append((entry, line, column, hint)) @@ -262,7 +258,7 @@ def download_url(url: str) -> Tuple[bool, Union[Exception, str]]: r = requests.get(url) r.raise_for_status() - open(path_to_file, 'wb').write(r.content) + open(path_to_file, "wb").write(r.content) except requests.exceptions.HTTPError as ex: return False, ex @@ -271,7 +267,10 @@ def download_url(url: str) -> Tuple[bool, Union[Exception, str]]: return False, ex except FileNotFoundError as urlex: - return False, f"{str(urlex)}\n{url} does not return a valid belns or belanno file" + return ( + False, + f"{str(urlex)}\n{url} does not return a valid belns or belanno file", + ) return True, path_to_file @@ -292,7 +291,7 @@ def get_namespace_header(file_path: str) -> Tuple[str, int]: """ header = "" ends_in_line = 0 - with codecs.open(file_path, 'r', encoding="utf-8") as fd: + with codecs.open(file_path, "r", encoding="utf-8") as fd: for line in fd: ends_in_line += 1 if not re.search(r"^[ \t]*(\[Values\])\s*(\r\n|\r|\n)", line): @@ -322,7 +321,6 @@ def save_from_url_or_path(self, keyword: str, url_or_path: str, doc_type: str): path_to_file = None if doc_type == URL: - downloaded, path_to_file_or_error = self.download_url(url_or_path) if not downloaded: @@ -333,7 +331,6 @@ def save_from_url_or_path(self, keyword: str, url_or_path: str, doc_type: str): path_to_file = path_to_file_or_error elif doc_type == FILE: - path_to_file = url_or_path saved, save_error = self.save_in_db(path_to_file=path_to_file, url=url_or_path, keyword=keyword) @@ -360,7 +357,7 @@ def save_in_db(self, path_to_file: str, url: str, keyword: str) -> Tuple[bool, O type Description of returned object. 
""" - grammar = parser.load_grammar(GRAMMAR_NS_ANNO_PATH) + grammar = load_grammar(GRAMMAR_NS_ANNO_PATH) header, header_ends_in_line = self.get_namespace_header(path_to_file) lark_parser = Lark(grammar, start=self.grammar_start) @@ -370,39 +367,42 @@ def save_in_db(self, path_to_file: str, url: str, keyword: str) -> Tuple[bool, O except Exception as e: return False, e - delimiter = parser.first_token_value(tree, 'pr_delimiter_string') - case_sensitive = parser.first_token_value(tree, 'pr_case_sensitive_flag') - cacheable = parser.first_token_value(tree, 'pr_cacheable_flag') + delimiter = first_token_value(tree, "pr_delimiter_string") + case_sensitive = first_token_value(tree, "pr_case_sensitive_flag") + cacheable = first_token_value(tree, "pr_cacheable_flag") is_case_sensitive = False if re.search("no", case_sensitive, re.I) else False is_cacheable = False if re.search("no", cacheable, re.I) else True - keyword_in_anno = parser.first_token_value(tree, 'keyword') + keyword_in_anno = first_token_value(tree, "keyword") if keyword != keyword_in_anno: warning = f"Keyword {keyword} in BEL namespace URL {url} is different from keyword in BEL script" logger.warning(warning) # ToDo save in Error classes self.errors += [warning] - model_instance = self.model(url=url, - keyword=keyword, - cacheable=is_cacheable, - case_sensitive=is_case_sensitive) + model_instance = self.model( + url=url, + keyword=keyword, + cacheable=is_cacheable, + case_sensitive=is_case_sensitive, + ) self.session.add(model_instance) self.session.commit() - table_name = self.name + '_entry' - second = {'annotation': 'identifier', 'namespace': 'encoding'} + table_name = self.name + "_entry" + second = {"annotation": "identifier", "namespace": "encoding"} second_column = second[self.name] - df = pd.read_csv(path_to_file, - delimiter=delimiter, - skip_blank_lines=True, - skiprows=header_ends_in_line, - names=['name', second_column], - encoding_errors="ignore", - ) - df[self.name + '__id'] = model_instance.id 
- df.set_index(['name', second_column], inplace=True) + df = pd.read_csv( + path_to_file, + delimiter=delimiter, + skip_blank_lines=True, + skiprows=header_ends_in_line, + names=["name", second_column], + encoding_errors="ignore", + ) + df[self.name + "__id"] = model_instance.id + df.set_index(["name", second_column], inplace=True) logger.info(f"Import `{keyword}` table '{table_name}' of engine '{self.session.bind.engine}'") @@ -426,9 +426,7 @@ def keyword_url_exists(self, keyword: str, url: str) -> bool: Description of returned object. """ - result = self.session.query(self.model).filter( - self.model.keyword == keyword, - self.model.url == url) + result = self.session.query(self.model).filter(self.model.keyword == keyword, self.model.url == url) if result.count() == 0: exists = False @@ -449,9 +447,11 @@ class NamespaceManager(ModelManager): def __int__(self, session_obj): """Init method.""" - super(NamespaceManager, self).__int__(model=Namespace, - entries_model=NamespaceEntry, - grammar_start=GRAMMAR_START_NS) + super().__init__( + model=Namespace, + entries_model=NamespaceEntry, + grammar_start=GRAMMAR_START_NS, + ) class AnnotationManager(ModelManager): @@ -459,6 +459,53 @@ class AnnotationManager(ModelManager): def __int__(self, session_obj): """Init method.""" - super(AnnotationManager, self).__int__(model=Annotation, - entries_model=AnnotationEntry, - grammar_start=GRAMMAR_START_ANNO) + super().__init__( + model=Annotation, + entries_model=AnnotationEntry, + grammar_start=GRAMMAR_START_ANNO, + ) + + +def load_grammar(grammar_path): + """Return eBNF grammar in lark style. + + Parameters + ---------- + grammar_path : str + path to eBNF grammar in lark style. + + Returns + ------- + string + eBNF grammar in lark style. 
+ + """ + # FIXME: something to do here + logger.info("load grammar {}".format(grammar_path)) + with codecs.open(grammar_path, "r", encoding="utf-8") as fd_grammar: + grammar = fd_grammar.read() + + return grammar + + +def first_token_value(tree: Tree, subtree_name: str) -> str: + """Get the first token value of Lark tree with subtree name. + + Parameters + ---------- + tree : type + Description of parameter `tree`. + subtree_name : type + Description of parameter `subtree_name`. + + Returns + ------- + type + Description of returned object. + + """ + # TODO: Get rid of this method by using a Transformer? Is this possible? + + for subtree in tree.iter_subtrees(): + if subtree.data == subtree_name: + return [node.value for node in subtree.children if isinstance(node, Token)][0] diff --git a/ebel/manager/molecular_interaction.py b/ebel/manager/molecular_interaction.py index f138fd7..6b9c2de 100644 --- a/ebel/manager/molecular_interaction.py +++ b/ebel/manager/molecular_interaction.py @@ -1,553 +1,1503 @@ """PSI-MI 25 Molecular interaction types (https://www.ebi.ac.uk/ols/ontologies/mi).""" -types = {0: 'molecular interaction', 1: 'interaction detection method', 2: 'participant identification method', - 3: 'feature detection method', 4: 'affinity chromatography technology', 5: 'alanine scanning', - 6: 'anti bait coimmunoprecipitation', 7: 'anti tag coimmunoprecipitation', 8: 'array technology', - 9: 'bacterial display', 10: 'beta galactosidase complementation', 11: 'beta lactamase complementation', - 12: 'bioluminescence resonance energy transfer', 13: 'biophysical', 14: 'adenylate cyclase complementation', - 16: 'circular dichroism', 17: 'classical fluorescence spectroscopy', 18: 'two hybrid', - 19: 'coimmunoprecipitation', - 20: 'transmission electron microscopy', 21: 'colocalization by fluorescent probes cloning', - 22: 'colocalization by immunostaining', 23: 'colocalization/visualisation technologies', - 24: 'confirmational text mining', 25: 'copurification', 26: 
'correlated mutations', 27: 'cosedimentation', - 28: 'cosedimentation in solution', 29: 'cosedimentation through density gradient', 30: 'cross-linking study', - 31: 'protein cross-linking with a bifunctional reagent', - 32: 'de novo protein sequencing by mass spectrometry', - 33: 'deletion analysis', 34: 'display technology', 35: 'docking', 36: 'domain fusion', - 37: 'domain profile pairs', - 38: 'dynamic light scattering', 39: 'edman degradation', 40: 'electron microscopy', - 41: 'electron nuclear double resonance', 42: 'electron paramagnetic resonance', 43: 'electron resonance', - 45: 'experimental interaction detection', 46: 'experimental knowledge based', 47: 'far western blotting', - 48: 'filamentous phage display', 49: 'filter binding', 50: 'flag tag coimmunoprecipitation', - 51: 'fluorescence technology', 52: 'fluorescence correlation spectroscopy', - 53: 'fluorescence polarization spectroscopy', 54: 'fluorescence-activated cell sorting', - 55: 'fluorescent resonance energy transfer', 56: 'full identification by DNA sequencing', - 57: 'gene neighbourhood', - 58: 'genome based prediction', 59: 'gst pull down', 60: 'ha tag coimmunoprecipitation', 61: 'his pull down', - 62: 'his tag coimmunoprecipitation', 63: 'interaction prediction', 64: 'interologs mapping', - 65: 'isothermal titration calorimetry', 66: 'lambda phage display', 67: 'light scattering', - 68: 'mass detection of residue modification', 69: 'mass spectrometry studies of complexes', - 70: 'mobility shift', - 71: 'molecular sieving', 72: 'monoclonal antibody western blot', 73: 'mrna display', 74: 'mutation analysis', - 75: 'myc tag coimmunoprecipitation', 76: 'neural network on interface properties', - 77: 'nuclear magnetic resonance', - 78: 'nucleotide sequence identification', 79: 'other biochemical technologies', - 80: 'partial DNA sequence identification by hybridization', 81: 'peptide array', - 82: 'peptide massfingerprinting', - 83: 'peptide synthesis', 84: 'phage display', 85: 'phylogenetic 
profile', - 86: 'polyclonal antibody western blot', - 87: 'predictive text mining', 88: 'primer specific pcr', 89: 'protein array', - 90: 'protein complementation assay', - 91: 'chromatography technology', 92: 'protein in situ array', 93: 'protein sequence identification', - 94: 'protein staining', 95: 'proteinchip(r) on a surface-enhanced laser desorption/ionization', - 96: 'pull down', - 97: 'reverse ras recruitment system', 98: 'ribosome display', 99: 'scintillation proximity assay', - 100: 'sequence based phylogenetic profile', 101: 'sequence based prediction', - 102: 'sequence tag identification', - 103: 'southern blot', 104: 'static light scattering', 105: 'structure based prediction', - 106: 'surface patches', - 107: 'surface plasmon resonance', 108: 't7 phage display', 109: 'tap tag coimmunoprecipitation', - 110: 'text mining', - 111: 'dihydrofolate reductase reconstruction', 112: 'ubiquitin reconstruction', 113: 'western blot', - 114: 'x-ray crystallography', 115: 'yeast display', 116: 'feature type', 117: 'binding-associated region', - 118: 'mutation', 119: 'mutation decreasing interaction', 120: 'post translation modification', - 121: 'acetylated residue', 122: 'n-acetyl-alanine', 123: 'n2-acetyl-arginine', 124: 'n-acetyl-asparagine', - 125: 'n-acetyl-aspartic acid', 126: 'n-acetyl-cysteine', 127: 'n-acetyl-glutamine', - 128: 'n-acetyl-glutamic acid', - 129: 'n-acetylglycine', 130: 'n-acetyl-histidine', 131: 'n-acetyl-isoleucine', 132: 'n-acetyl-leucine', - 133: 'n2-acetyl-lysine', 134: 'n6-acetyl-lysine', 135: 'n-acetyl-methionine', 136: 'n-acetyl-phenylalanine', - 137: 'n-acetyl-proline', 138: 'n-acetyl-serine', 139: 'n-acetyl-threonine', 140: 'n-acetyl-tryptophan', - 141: 'n-acetyl-tyrosine', 142: 'n-acetyl-valine', 143: 'amidated residue', 145: 'arginine amide', - 146: 'formylated residue', - 147: 'n-formyl-methionine', 148: 'hydroxylated residue', 150: 'lipid modification', - 151: 's-farnesyl-cysteine', 152: 's-geranylgeranyl-cysteine', 153: 
'n-palmitoyl-cysteine', - 154: 's-palmitoyl-cysteine', - 155: 'n-myristoyl-glycine', 156: 'n6-myristoyl-lysine', 157: 'methylated residue', 158: 'n-methyl-alanine', - 159: 'n,n,n-trimethyl-alanine', 160: 'omega-n,omega-n-dimethyl-arginine', - 161: 'beta-methylthioaspartic acid', - 162: 'n5-methyl-glutamine', 163: 'glutamic acid 5-methyl ester', 165: 'n6-methyl-lysine', - 166: 'n6,n6-dimethyl-lysine', - 167: 'n6,n6,n6-trimethyl-lysine', 168: 'n-methyl-methionine', 169: 'n-methyl-phenylalanine', - 170: 'phosphorylated residue', 171: 'omega-n-phospho-arginine', 172: 'aspartic 4-phosphoric anhydride', - 173: 's-phospho-cysteine', 176: 'o-phospho-serine', 177: 'o-phospho-threonine', 178: "o4'-phospho-tyrosine", - 179: 'other modification', 180: 'selenocysteine', 181: 'selenomethionine', 182: '3-oxoalanine', - 184: 'glutamyl 5-glycerylphosphorylethanolamine', 186: 'n6-biotinyl-lysine', - 187: 'n6-(4-amino-2-hydroxybutyl)-lysine', - 188: 'n6-retinal-lysine', 189: 'ubiquitinated lysine', 190: 'interaction type', 191: 'aggregation', - 192: 'acetylation reaction', 193: 'amidation reaction', 194: 'cleavage reaction', 195: 'covalent binding', - 196: 'covalent interaction', 197: 'deacetylation reaction', 198: 'defarnesylation reaction', - 199: 'deformylation reaction', 200: 'degeranylation reaction', 201: 'demyristoylation reaction', - 202: 'depalmitoylation reaction', 203: 'dephosphorylation reaction', 204: 'deubiquitination reaction', - 205: 'disaggregation', 206: 'farnesylation reaction', 207: 'formylation reaction', - 208: 'genetic interaction', - 209: 'geranylgeranylation reaction', 210: 'hydroxylation reaction', 211: 'lipid addition', - 212: 'lipoprotein cleavage reaction', 213: 'methylation reaction', 214: 'myristoylation reaction', - 215: 'non covalent interaction', 216: 'palmitoylation reaction', 217: 'phosphorylation reaction', - 218: 'physical interaction', 219: 'synthetic lethal', 220: 'ubiquitination reaction', - 221: 'expression level', - 222: 'physiological 
level', 223: 'under expressed level', 225: 'chromatin immunoprecipitation array', - 226: 'ion exchange chromatography', 227: 'reverse phase chromatography', - 228: 'cytoplasmic complementation assay', - 230: 'membrane bound complementation assay', 231: 'mammalian protein protein interaction trap', - 232: 'transcriptional complementation assay', 233: 'protein dna complex', 234: '131i radiolabel', - 235: '14c radiolabel', 236: '32p radiolabel', 237: '33p radiolabel', 238: '3h radiolabel', - 239: 'biotin tag', - 240: 'fusion protein', 241: 'horseradish peroxidase tag', 242: 'gene ontology definition reference', - 243: 'isoform parent sequence reference', 244: 'reactome complex', 245: 'reactome protein', 246: 'cabri', - 247: 'newt', - 248: 'resid', 249: 'huge', 250: 'gene', 251: 'gene product', 252: 'biological feature', - 253: 'isotope label', - 254: 'genetic interference', 255: 'post transcriptional interference', 256: 'rna interference', - 257: 'antisense rna', - 258: 'inhibitor antibodies', 259: 'perturbagens peptides', 260: 'inhibitor small molecules', - 261: 'suppression', - 262: 'suppression mutation', 263: 'suppression knockout', 264: 'suppression partial alteration', - 265: 'suppression expression alteration', 266: 'suppression overexpression', 267: 'suppression scalable', - 268: 'suppression underexpression', 269: 'synthetic phenotype', 270: 'conditional synthetic lethal', - 271: 'conditional synthetic lethal temperature-sensitivity', 273: 'synthetic growth effect', - 274: 'synthetic growth defect', 275: 'synthetic growth increase', 276: 'blue native page', - 300: 'alias type', - 301: 'gene name', 302: 'gene name synonym', 303: 'gene ontology synonym', 304: 'isoform synonym', - 305: 'ordered locus name', 306: 'open reading frame name', 307: 'delivery method', 308: 'electroporation', - 309: 'genomic tagging', 310: 'infection', 311: 'microinjection', 312: 'nucleic acid transfection', - 313: 'interactor type', 314: 'complex', 315: 'protein complex', 316: 
'ribonucleoprotein complex', - 317: 'interaction', - 318: 'nucleic acid', 319: 'deoxyribonucleic acid', 320: 'ribonucleic acid', 321: 'catalytic rna', - 322: 'guide rna', - 323: 'heterogeneous nuclear rna', 324: 'messenger rna', 325: 'transfer rna', 326: 'protein', - 327: 'peptide', - 328: 'small molecule', 329: 'unknown participant', 330: 'molecular source', 331: 'engineered', - 332: 'naturally occurring', 333: 'feature range status', 334: 'c-terminal position', - 335: 'certain sequence position', - 336: 'greater-than', 337: 'less-than', 338: 'range', 339: 'undetermined sequence position', - 340: 'n-terminal position', - 341: 'ragged n-terminus', 342: 'sample process', 343: 'cdna library', 344: 'cell lysate', - 345: 'author assigned name', - 346: 'experimental preparation', 348: 'fixed cell', 349: 'living cell', 350: 'purified', - 351: 'homogeneous', - 352: 'partially purified', 353: 'cross-reference type', 354: 'cellular component', - 355: 'molecular function', - 356: 'identical object in an external resource', 357: 'method reference', 358: 'primary-reference', - 359: 'biological process', 360: 'secondary accession number', 361: 'additional information', - 362: 'inference', - 363: 'inferred by author', 364: 'inferred by curator', 365: 'enzyme tag', 366: 'alkaline phosphatase tag', - 367: 'green fluorescent protein tag', 368: 'yellow fluorescent protein tag', - 369: 'lex-a dimerization assay', - 370: 'tox-r dimerization assay', 371: '35s radiolabel', 372: 'subcellular preparation', 373: 'dye label', - 374: 'cyanine label', 375: 'cy3 label', 376: 'cy5 label', 377: 'fluorescein isothiocyanate label', - 378: 'rare isotope label', 379: '13c label', 380: '15n label', 381: '2h label', - 382: 'mutation increasing interaction', - 383: 'biopolymer', 384: 'alexa label', 385: 'alexa 350 label', 386: 'alexa 430 label', - 387: 'alexa 488 label', - 388: 'alexa 532 label', 389: 'alexa 546 label', 390: 'alexa 568 label', 391: 'alexa 594 label', - 396: 'predetermined participant', 
397: 'two hybrid array', 398: 'two hybrid pooling approach', - 399: 'two hybrid fragment pooling approach', 400: 'affinity technology', 401: 'biochemical', - 402: 'chromatin immunoprecipitation assay', 403: 'colocalization', - 404: 'comigration in non denaturing gel electrophoresis', 405: 'competition binding', - 406: 'deacetylase assay', - 407: 'direct interaction', 408: 'disulfide bond', 409: 'dna footprinting', 410: 'electron tomography', - 411: 'enzyme linked immunosorbent assay', 412: 'electrophoretic mobility supershift assay', - 413: 'electrophoretic mobility shift assay', 414: 'enzymatic reaction', 415: 'enzymatic study', - 416: 'fluorescence microscopy', 417: 'footprinting', 418: 'genetic', 419: 'gtpase assay', - 420: 'kinase homogeneous time resolved fluorescence', 421: 'identification by antibody', - 422: 'immunostaining', - 423: 'in-gel kinase assay', 424: 'protein kinase assay', 425: 'kinase scintillation proximity assay', - 426: 'light microscopy', 427: 'Identification by mass spectrometry', 428: 'imaging technique', - 429: 'necessary binding region', 430: 'nucleic acid uv cross-linking assay', 431: 'obsolete', - 432: 'one hybrid', - 433: 'partial identification of protein sequence', 434: 'phosphatase assay', 435: 'protease assay', - 436: 'protein footprinting', 437: 'protein three hybrid', 438: 'rna three hybrid', - 439: 'random spore analysis', - 440: 'saturation binding', 441: 'synthetic genetic analysis', 442: 'sufficient binding region', - 443: 'ubiquitin binding', 444: 'database citation', 445: 'literature database', 446: 'pubmed', - 447: 'feature database', - 448: 'gene ontology', 449: 'interpro', 450: 'cdd', 451: 'pfam', 452: 'pirsf', 453: 'prints', 454: 'prodom', - 455: 'prosite', 456: 'scop superfamily', 457: 'smart', 458: 'tigrfams', 459: 'mmdb', 460: 'rcsb pdb', - 461: 'interaction database', 462: 'bind', 463: 'biogrid', 464: 'cygd', 465: 'dip', 466: 'ecocyc', - 467: 'reactome', - 468: 'hprd', 469: 'intact', 470: 'kegg', 471: 'mint', 472: 
'pdbe', 473: 'participant database', 474: 'chebi', - 475: 'ddbj/embl/genbank', 476: 'ensembl', 477: 'entrez gene/locuslink', 478: 'flybase', 479: 'mgd/mgi', - 480: 'omim', - 481: 'refseq', 482: 'rfam', 483: 'rgd', 484: 'sgd', 485: 'uniparc', 486: 'uniprot knowledge base', - 487: 'wormbase', - 488: 'psi-mi', 489: 'source database', 490: 'experiment condition', 491: 'in silico', 492: 'in vitro', - 493: 'in vivo', - 494: 'in situ', 495: 'experimental role', 496: 'bait', 497: 'neutral component', 498: 'prey', - 499: 'unspecified role', - 500: 'biological role', 501: 'enzyme', 502: 'enzyme target', 503: 'self', 505: 'experimental feature', - 506: 'over expressed level', 507: 'tag', 508: 'deacetylase radiometric assay', - 509: 'phosphatase homogeneous time resolved fluorescence', 510: 'homogeneous time resolved fluorescence', - 511: 'protease homogeneous time resolved fluorescence', 512: 'zymography', 513: 'collagen film assay', - 514: 'in gel phosphatase assay', 515: 'methyltransferase assay', 516: 'methyltransferase radiometric assay', - 517: 'radiolabel', 518: 'flag tag', 519: 'glutathione s tranferase tag', 520: 'ha tag', 521: 'his tag', - 522: 'myc tag', - 523: 't7 tag', 524: 'calmodulin binding peptide plus protein a tag', 525: 'v5 tag', 526: 'n-acetyl-lysine', - 527: 'adp ribosylated residue', 528: 'omega-n-(adp-ribosyl)-arginine', 529: 's-(adp-ribosyl)-cysteine', - 530: 'glutamyl-5-poly(adp-ribose)', 531: 'o-(adp-ribosyl)-serine', 532: 'n4-(adp-ribosyl)-asparagine', - 533: 'glycosylated residue', 534: 'glycosyl-cysteine', 535: 'glycosyl-serine', 536: 'glycosyl-threonine', - 537: 'omega-n-glycosyl-arginine', 538: 'n4-glycosyl-asparagine', 539: 'gpi anchor residue', - 540: 'gpi-anchor amidated alanine', 541: 'gpi-anchor amidated asparagine', - 542: 'gpi-anchor amidated aspartate', - 543: 'gpi-anchor amidated cysteine', 544: 'gpi-anchor amidated glycine', 545: 'gpi-anchor amidated serine', - 546: 'gpi-anchor amidated threonine', 547: 's-prenyl-cysteine', 548: 
'methylated-lysine', - 549: 'alkylated cysteine', - 550: 'gamma-carboxyglutamic acid', 551: 'nitro-tyrosine', 552: 's-nitrosyl-cysteine', - 553: "o4'-sulfo-tyrosine", - 554: 'sumoylated lysine', 555: 'phospho-histidine', 556: 'transglutamination reaction', - 557: 'adp ribosylation reaction', 558: 'deglycosylation reaction', 559: 'glycosylation reaction', - 560: 'myristoylated residue', 561: 'palmitoylated residue', 562: 'methylated alanine', - 563: 'methylated arginine', - 564: 'omega-n-methyl-arginine', 565: 'neddylated lysine', 566: 'sumoylation reaction', - 567: 'neddylation reaction', - 568: 'desumoylation reaction', 569: 'deneddylation reaction', 570: 'protein cleavage', 571: 'mrna cleavage', - 572: 'dna cleavage', 573: 'mutation disrupting interaction', 574: 'digital object identifier', - 575: 'alliance for cellular signaling', 576: 'structural proximity', - 577: 'feature prediction from structure', - 578: 'maltose binding protein tag', 579: 'electron donor', 580: 'electron acceptor', 581: 'suppressor gene', - 582: 'suppressed gene', 583: 'fluorescence donor', 584: 'fluorescence acceptor', 585: 'intenz', - 586: 'inhibitor', - 587: 'inhibited', 588: 'three hybrid', 589: 'in vitro translated protein', 590: 'attribute name', - 591: 'experiment description', 592: 'ipfam', 593: 'translocation', 594: 'translocation start', - 595: 'translocation end', 596: 'experimental form description', 597: 'feature description', - 598: 'feature constraint', - 599: 'figure legend', 600: 'conditional synthetic lethal nutrition-sensitivity', 601: 'sequence ontology', - 602: 'chemical footprinting', 603: 'dimethylsulphate footprinting', - 604: 'potassium permanganate footprinting', - 605: 'enzymatic footprinting', 606: 'DNase I footprinting', 607: 'small nuclear rna', 608: 'ribosomal rna', - 609: 'small nucleolar rna', 610: 'small interfering rna', 611: 'signal recognition particle rna', - 612: 'comment', - 613: 'function', 614: 'url', 615: 'search-url', 616: 'example', 617: 
'disease', 618: 'caution', - 619: 'pathway', - 620: 'search-url-ascii', 621: 'author-confidence', 622: 'confidence-mapping', 623: 'inhibition', - 624: 'stimulant', - 625: 'agonist', 626: 'antagonist', 627: 'experiment modification', 628: 'validation regular expression', - 629: 'complex-properties', 630: '3d-structure', 631: '3d-r-factors', 632: '3d-resolution', - 633: 'data-processing', - 634: 'contact-email', 635: 'contact-comment', 636: 'author-list', 637: 'isoform-comment', - 638: 'prerequisite-ptm', - 639: 'resulting-ptm', 640: 'parameter type', 641: 'ic50', 642: 'ec50', 643: 'ki', 644: 'km', 645: 'kcat', - 646: 'Kd', - 647: 'parameter unit', 648: 'molar', 649: 'second', 650: 'millimolar', 651: 'micromolar', 652: 'nanomolar', - 653: 'picomolar', 654: 'fentomolar', 655: 'lambda repressor two hybrid', 656: 'identified peptide', - 657: 'systematic evolution of ligands by exponential enrichment', - 658: 'multidimensional protein identification technology', 659: 'experimental feature detection', - 660: 'feature prediction', 661: 'experimental participant identification', 662: 'imex-primary', - 663: 'confocal microscopy', 664: 'interaction attribute name', 665: 'experiment attibute name', - 666: 'participant attribute name', 667: 'controlled vocabulary attribute name', - 668: 'feature attribute name', - 669: 'organism attribute name', 670: 'imex', 671: 'antibodies', 672: 'library-used', 673: 'complex synonym', - 674: 'peptide parent sequence reference', 675: 'international protein index', - 676: 'tandem affinity purification', - 677: 'tandem tag', 678: 'antibody array', 679: 'poly adenine', 680: 'single stranded deoxyribonucleic acid', - 681: 'double stranded deoxyribonucleic acid', 682: 'cofactor', 683: 'sequence database', 684: 'ancillary', - 685: 'source reference', 686: 'unspecified method', 687: 'fluorescent protein tag', - 688: 'dna binding domain tag', - 689: 'activation domain tag', 690: 'gal4 activation domain', 691: 'vp16 activation domain', - 692: 'b42 
activation domain', 693: 'gal4 dna binding domain', 694: 'lexa dna binding domain', - 695: 'sandwich immunoassay', 696: 'polymerase assay', 697: 'dna directed dna polymerase assay', - 698: 'dna directed rna polymerase assay', 699: 'rna directed dna polymerase assay', - 700: 'rna directed rna polymerase assay', 701: 'dna strand elongation', 702: 'panther', 703: 'gene3d', - 704: 'nucleic acid delivery', 705: 'anti tag western blot', 706: 'nucleic acid transformation', - 707: 'anti tag immunostaining', 708: 'monoclonal antibody immunostaining', - 709: 'polyclonal antibody immunostaining', - 710: 'nucleic acid transformation by treatment with divalent cation', 711: 'nucleic acid electroporation', - 712: 'nucleic acid microinjection', 713: 'nucleic acid passive uptake', 714: 'nucleic acid transduction', - 715: 'nucleic acid conjugation', 716: 'passive uptake', 717: 'nucleic acid transfection with liposome', - 718: 'nucleic acid transfection by treatment', 719: 'calcium phosphate nucleic acid transfection', - 720: 'nucleic acid delivery by infection', 721: 'protein delivery', 722: 'protein electroporation', - 723: 'protein microinjection', 724: 'protein delivery by cationic lipid treatment', - 725: 'protein delivery by infection', 726: 'reverse two hybrid', 727: 'lexa b52 complementation', - 728: 'gal4 vp16 complementation', 729: 'luminescence based mammalian interactome mapping', 730: 'pubchem', - 731: '3d repertoire', 732: 'red fluorescent protein tag', 733: 'cyan fluorescent protein tag', - 734: 'enhanced green fluorescent protein tag', 735: 'transactivating tag', 736: 'protein passive uptake', - 737: 'peptide sequence database', 738: 'pride', 739: 'penetrating tag', 740: 'cell penetrating peptide tag', - 741: 'peptide atlas', 742: 'gpm', 787: 'genetic experimental form', 788: 'knock out', 789: 'knock down', - 790: 'hypermorph', 791: 'hypomorph', 792: 'antimorph', 793: 'amorph', 794: 'synthetic', 795: 'asynthetic', - 796: 'suppression', 797: 'epistasis', 798: 
'conditional genetic interaction defined by inequality', - 799: 'additive genetic interaction defined by inequality', - 800: 'single nonmonotonic genetic interaction defined by inequality', - 801: 'double nonmonotonic genetic interaction defined by inequality', 802: 'enhancement interaction', - 803: 'expression level alteration', 804: 'mutated gene', 805: 'wwpdb', 806: 'pdbj', - 807: 'comigration in gel electrophoresis', 808: 'comigration in sds page', - 809: 'bimolecular fluorescence complementation', 810: 'substitution analysis', 811: 'insertion analysis', - 812: 'calmodulin binding protein tag', 813: 'proximity ligation assay', - 814: 'protease accessibility laddering', - 815: 'confirmation by molecular weight', 816: 'molecular weight estimation by staining', - 817: 'molecular weight estimation by silver staining', - 818: 'molecular weight estimation by coomasie staining', - 819: 'molecular weight estimation by bromide staining', 820: 'molecular weight estimation by sybr staining', - 821: 'molecular weight estimation by autoradiography', - 822: 'molecular weight estimation by hoechst staining', - 823: 'predetermined feature', 824: 'x-ray powder diffraction', 825: 'x-ray fiber diffraction', - 826: 'x ray scattering', - 827: 'x-ray tomography', 828: 'polyprotein fragment', 829: 'multiple parent reference', 830: 'tissue list', - 831: 'cell ontology', 832: 'half cystine', 833: 'autoradiography', 834: 'ka', 835: 'koff', - 836: 'temperature of interaction', 837: 'pH of interaction', 838: 'kelvin', 839: 'per mole per second', - 840: 'stimulator', 841: 'phosphotransferase assay', 842: 'phosphate donor', 843: 'phosphate acceptor', - 844: 'phosphotransfer reaction', 845: 'spin label', 846: 'r1 spin label', 847: 'dansyl label', - 848: '125i radiolabel', - 849: 'ncbi taxonomy', 850: 'encode', 851: 'protein genbank identifier', 852: 'nucleotide genbank identifier', - 853: 'dna overhang', 854: '3 prime overhang', 855: '5 prime overhang', 856: 'fluorophore', - 857: 
'fluorescent dye label', 858: 'immunodepleted coimmunoprecipitation', 859: 'intermolecular force', - 860: 'genbank indentifier', 861: 'protein a tag', 862: 'zz tag', 863: 'thiol reactive lanthanide label', - 864: 'brenda', - 865: 'fluorescence acceptor donor pair', 866: 'tag visualisation', 867: 'tag visualisation by fluorescence', - 868: 'author identifier', 869: 'originally assigned identifier', 870: 'demethylase assay', - 871: 'demethylation reaction', 872: 'atomic force microscopy', 873: 'curation request', 875: 'dataset', - 878: 'author submitted', 879: 'nucleoside triphosphatase assay', 880: 'atpase assay', - 881: 'nucleoside triphosphatase reaction', 882: 'atpase reaction', 883: 'gtpase reaction', 884: 'vsv tag', - 885: 'journal', 886: 'publication year', 887: 'histone acetylase assay', - 888: 'small angle neutron scattering', - 889: 'acetylase assay', 890: 'qdot', 891: 'neutron fiber diffraction', 892: 'solid phase assay', - 893: 'neutron diffraction', 894: 'electron diffraction', 895: 'protein kinase A complementation', - 896: 'renilla luciferase protein tag', 897: 'protein modification ontology', 898: 'putative self', - 899: 'p3 filamentous phage display', 900: 'p8 filamentous phage display', 901: 'isotope label footprinting', - 902: 'rna cleavage', 903: 'mpidb', 904: 'polysaccharide', - 905: 'amplified luminescent proximity homogeneous assay', - 906: 'au1 tag', 907: 'conformational status', 908: 'denatured', 909: 'native', 910: 'nucleic acid cleavage', - 911: 'cross linker', 912: 'spdp cross linker', 913: 'lc-spdp cross linker', 914: 'association', - 915: 'physical association', 916: 'lexa vp16 complementation', 917: 'matrixdb', 918: 'donor', 919: 'acceptor', - 920: 'ribonuclease assay', 921: 'surface plasmon resonance array', 922: 'imex evidence', 923: 'irefindex', - 924: 'camjedb', 925: 'observed-ptm', 926: 'fiash label', 927: 'iaedans label', 928: 'filter trap assay', - 929: 'northern blot', 930: 'epistatis', 931: 'genetic interaction defined by 
inequality', - 932: 'noninteractive', - 933: 'negative genetic interaction', 934: 'neutral genetic interaction', - 935: 'positive genetic interaction', - 936: 'emdb', 937: 'glu tag', 938: 'rheology measurement', 939: 'fluorescein label', - 940: 'fluorescein-5-maleimide label', 941: 'competitor', 942: 'uniprot taxonomy', - 943: 'detection by mass spectrometry', 944: 'mass spectrometry study of hydrogen/deuterium exchange', - 945: 'oxidoreductase activity electron transfer reaction', 946: 'miniaturized immunoprecipitation', - 947: 'bead aggregation assay', 948: 'kinetic conditions', 949: 'gdp/gtp exchange assay', - 950: 'trapping mutant', - 951: 'chain parent sequence reference', 952: 'imex secondary', 953: 'polymerization', - 954: 'curation quality', - 955: 'curation depth', 956: 'curation coverage', 957: 'full coverage', 958: 'partial coverage', - 959: 'imex curation', - 960: 'mimix curation', 961: 'rapid curation', 962: 'strep ii tag', - 963: 'interactome parallel affinity capture', - 964: 'infrared spectroscopy', 965: '2d-infrared spectrometry', 966: 'ultraviolet-visible spectroscopy', - 967: 'chembl compound', 968: 'biosensor', 969: 'bio-layer interferometry', 970: 'inchi key', - 971: 'phosphopantetheinylation', 972: 'phosphopantetheinylase assay', 973: 'imex source', 974: 'innatedb', - 975: 'fc-igg tag', 976: 'total internal reflection fluorescence spectroscopy', 977: 'no-imex-export', - 978: 'author-name', 979: 'oxidoreductase assay', 980: 'tag visualisation by enzyme assay', - 981: 'tag visualisation by peroxidase activity', 982: 'electrophoretic mobility-based method', 983: 'gemma', - 984: 'deamination assay', 985: 'deamination reaction', 986: 'nucleic acid strand elongation reaction', - 987: 'rna strand elongation', 988: 'strep tag', 989: 'amidase assay', 990: 'cleavage assay', - 991: 'lipoprotein cleavage assay', 992: 'defarnesylase assay', 993: 'degeranylase assay', - 994: 'demyristoylase assay', - 995: 'depalmitoylase assay', 996: 'deformylase assay', 
997: 'ubiquitinase assay', 998: 'deubiquitinase assay', - 999: 'formylase assay', 1000: 'hydroxylase assay', 1001: 'lipidase assay', 1002: 'myristoylase assay', - 1003: 'geranylgeranylase assay', 1004: 'palmitoylase assay', 1005: 'adp ribosylase assay', - 1006: 'deglycosylase assay', - 1007: 'glycosylase assay', 1008: 'sumoylase assay', 1009: 'desumoylase assay', 1010: 'neddylase assay', - 1011: 'deneddylase assay', 1012: 'sbp', 1013: 'ensemblgenomes', 1014: 'string', 1015: 'dictybase', - 1016: 'fluorescence recovery after photobleaching', 1017: 'rna immunoprecipitation', 1018: 'deltamass', - 1019: 'protein phosphatase assay', 1020: 'hilyte fluor 488', 1021: 'qx 520', - 1022: 'field flow fractionation', - 1023: 'luminogreen', 1024: 'scanning electron microscopy', 1025: 'unimod', 1026: 'diphtamidase assay', - 1027: 'diphtamidation reaction', 1028: 'modified chromatin immunoprecipitation', - 1029: 'proteomics of isolated chromatin segments', 1030: 'excimer fluorescence', - 1031: 'protein folding/unfolding', - 1032: 'atto 488', 1033: 'atto 550', 1034: 'nuclease assay', 1035: 'deoxyribonuclease assay', - 1036: 'nucleotide exchange assay', 1037: 'Split renilla luciferase complementation', - 1038: 'silicon nanowire field-effect transistor', 1039: 'c-terminal range', 1040: 'n-terminal range', - 1041: 'synonym', - 1042: 'pubmed central', 1043: 'flannotator', 1044: 'rice genome annotation project', - 1045: 'curation content', - 1046: 'interacting molecules', 1047: 'protein-protein', 1048: 'smallmolecule-protein', - 1049: 'nucleicacid-protein', - 1050: 'interaction representation', 1051: 'evidence', 1052: 'clustered', 1053: 'data source', - 1054: 'experimentally-observed', 1055: 'internally-curated', 1056: 'text-mining', 1057: 'predicted', - 1058: 'imported', - 1059: 'complex expansion', 1060: 'spoke expansion', 1061: 'matrix expansion', 1062: 'bipartite expansion', - 1063: 'consensuspathdb', 1064: 'interaction confidence', 1065: 'replication-based confidence', - 1066: 
'structure-based confidence', 1067: 'function-based confidence', 1068: 'location-based confidence', - 1069: 'network-based confidence', 1070: 'standard-based confidence', 1071: 'literature-based confidence', - 1072: 'method-based confidence', 1073: 'statistical-based confidence', 1074: 'rgs-his tag', - 1075: 'beilstein', - 1076: 'einecs', 1077: 'merck index', 1078: 'plantgdb', 1079: 'ratmap', 1080: 'tair', 1081: 'tigr/jcvi', - 1082: 'zfin', - 1083: 'cog', 1084: 'photon donor', 1085: 'photon acceptor', 1086: 'equilibrium dialysis', - 1087: 'monoclonal antibody blockade', 1088: 'phenotype-based detection assay', - 1089: 'nuclear translocation assay', - 1090: 'bimane label', 1091: 'publication title', 1092: 'atto label', 1093: 'bibliographic attribute name', - 1094: 'genome databases', 1095: 'hgnc', 1096: 'protein sequence databases', 1097: 'uniprot', - 1098: 'uniprot/swiss-prot', 1099: 'uniprot/trembl', 1100: 'bioactive entity', 1101: 'standard inchi key', - 1102: 'mapped-identity', 1103: 'solution state nmr', 1104: 'solid state nmr', 1105: 'biocyc', - 1106: 'pathways database', 1107: 'pid', 1108: 'biocarta', 1109: 'gene database', - 1110: 'predicted interaction', - 1111: 'two hybrid bait or prey pooling approach', 1112: 'two hybrid prey pooling approach', - 1113: 'two hybrid bait and prey pooling approach', 1114: 'virhostnet', 1115: 'spike', 1116: 'genemania', - 1117: 'topfind', 1118: 'enhanced yellow fluorescent protein tag', 1119: 'nYFP', 1120: 'cYFP', 1121: 'ceYFP', - 1122: 'neYFP', 1123: 'bindingdb', 1124: 'pathwaycommons', 1125: 'direct binding region', - 1126: 'self interaction', - 1127: 'putative self interaction', 1128: 'mutation disrupting interaction strength', - 1129: 'mutation disrupting interaction rate', 1130: 'mutation decreasing interaction rate', - 1131: 'mutation increasing interaction rate', 1132: 'mutation increasing interaction strength', - 1133: 'mutation decreasing interaction strength', 1134: 'mcherry fluorescent protein tag', - 1135: 'venus 
fluorescent protein tag', 1136: 'kusabira-green protein tag', 1137: 'carboxylation assay', - 1138: 'decarboxylation assay', 1139: 'carboxylation reaction', 1140: 'decarboxylation reaction', - 1141: 's tag', - 1142: 'aminoacylation assay', 1143: 'aminoacylation reaction', 1144: 'protein a tag visualisation', - 1145: 'phospholipase assay', 1146: 'phospholipase reaction', 1147: 'ampylation assay', - 1148: 'ampylation reaction', - 1149: 'cooperative interaction', 1150: 'affected interaction', 1151: 'participant-ref', - 1152: 'cooperative effect value', 1153: 'cooperative effect outcome', 1154: 'positive cooperative effect', - 1155: 'negative cooperative effect', 1156: 'cooperative mechanism', 1157: 'allostery', 1158: 'pre-assembly', - 1159: 'allosteric molecule', 1160: 'allosteric effector', 1161: 'allosteric response', - 1162: 'allosteric k-type response', 1163: 'allosteric v-type response', 1164: 'allosteric mechanism', - 1165: 'allosteric change in structure', 1166: 'allosteric change in dynamics', 1167: 'allostery type', - 1168: 'heterotropic allostery', 1169: 'homotropic allostery', 1170: 'pre-assembly response', - 1171: 'composite binding site formation', 1172: 'altered physicochemical compatibility', - 1173: 'binding site hiding', - 1174: 'configurational pre-organization', 1175: 'allosteric post-translational modification', - 1176: 'sequence based prediction of gene regulatory region binding sites', - 1177: 'phylogenetic profiling of predicted gene regulatory region binding sites', - 1178: 'sequence based prediction of binding of transcription factor to transcribed gene regulatory elements', - 1179: 'partial nucleotide sequence identification', 1180: 'partial DNA sequence identification', - 1181: 'paired end tags sequence identification', 1182: 'full identification by RNA sequencing', - 1183: 'nuclease footprinting', 1184: 'dna adenine methyltransferase identification', - 1185: 'tag visualisation by dna adenine methyltransferase', 1186: 'dna methyltransferase 
tag', 1187: 'damip', - 1188: 'tag visualisation by mutated dna adenine methyltransferase', 1189: 'methylation interference assay', - 1190: 'hydroxy radical footprinting', 1191: 'ultraviolet (uv) footprinting', - 1192: 'antisense oligonucleotides', - 1193: 'partial RNA sequence identification', 1194: 'reverse transcription pcr', 1195: 'quantitative pcr', - 1196: 'quantitative reverse transcription pcr', 1197: 'radioimmunoassay', 1198: 'immunohistochemistry', - 1199: 'anti-tag immunohistochemistry', 1200: 'immunocytochemistry', 1201: 'anti-tag immunocytochemistry', - 1202: 'one-strep-tag', 1203: 'split luciferase complementation', - 1204: 'split firefly luciferase complementation', - 1205: 'luciferase tag', 1206: 'renilla-n', 1207: 'renilla-c', 1208: 'firefly luciferase protein tag', - 1209: 'firefly-c', 1210: 'firefly-n', 1211: 'liposome binding assay', 1212: 'checksum', 1213: 'n-venus', - 1214: 'c-venus', 1215: 'bifc tag', 1216: 'cGFP', 1217: 'nGFP', 1218: 'chromosome conformation capture assay', - 1219: 'enzyme-mediated activation of radical sources', 1220: 'tag visualisation by luciferase assay', - 1221: 'author-based confidence', 1222: 'mbinfo', 1223: 'ptm decreasing an interaction', - 1224: 'ptm increasing an interaction', 1225: 'ptm disrupting an interaction', 1226: 'ampylation assay', - 1227: 'lap tag', 1228: 'pyo tag', 1229: 'uridylation assay', 1230: 'uridylation reaction', - 1231: 'aminomethylcoumarin label', 1232: 'aggregation assay', 1233: 'resulting-cleavage', 1234: 'silac', - 1235: 'thermal shift binding', 1236: 'proline isomerase assay', 1237: 'proline isomerization reaction', - 1238: 'mass spectrometry studies of subunit exchange', 1239: 'amino-acid variant', - 1240: 'disease causing amino-acid variant', 1241: 'variant', 1242: 'fc-igg1', 1243: 'fc-igg2', 1244: 'mkate2', - 1245: 'mkate', 1246: 'ion mobility mass spectrometry of complexes', 1247: 'microscale thermophoresis', - 1248: 'bodipy label', 1249: 'isomerase assay', 1250: 'isomerase reaction', - 
1251: 'methylmalonyl-CoA isomerase reaction', 1252: 'methylmalonyl-CoA isomerase asf say', 1253: 'atto 532', - 1254: 'atto 647', 1255: 'stilbene label', 1256: 'luminscent dye label', 1257: 'rhodamine label', - 1258: 'tetramethyl rhodamine label', 1259: 'acrylodan label', 1260: 'pyrene label', - 1261: 'oregon green label', - 1262: 'iid', 1263: 'molecular connections', 1264: 'ntnu', 1265: 'ubiquitin reconstruction tag', 1266: 'cub', - 1267: 'nub', 1268: 'nubg', 1269: 'duplicated protein', 1270: 'xpress tag', 1271: 'enhancement', - 1272: 'positive epistasis', 1273: 'maximal epistasis', 1274: 'minimal epistasis', 1275: 'neutral epistasis', - 1276: 'opposing epistasis', 1277: 'qualitative epistasis', 1278: 'mutual enhancement', - 1279: 'unilateral enhancement', - 1280: 'mutual suppression', 1281: 'mutual suppression (complete)', 1282: 'mutual suppression (partial)', - 1283: 'suppression-enhancement', 1284: 'quantitative epistasis', 1285: 'opposing epistasis', - 1286: 'over-suppression', - 1287: 'mutual over-suppression', 1288: 'over-suppression-enhancement', 1289: 'phenotype bias', - 1290: 'suppression (complete)', 1291: 'suppression (partial)', 1292: 'unilateral suppression', - 1293: 'unilateral suppression (complete)', 1294: 'unilateral suppression (partial)', - 1295: 'unilateral over-suppression', 1296: 'amino acid analysis', 1297: 'phosphoamino acid analysis', - 1298: 'complex type', 1299: 'complex composition', 1300: 'obligate complex', 1301: 'non-obligate complex', - 1302: 'stable complex', 1303: 'transient complex', 1304: 'molecule set', 1305: 'candidate set', - 1306: 'open set', - 1307: 'defined set', 1308: 'resulting sequence', 1309: 'de-ADP-ribosylation assay', - 1310: 'de-ADP-ribosylation reaction', 1311: 'differential scanning calorimetry', 1312: 'aut-page', - 1313: 'proximity labelling technology', 1314: 'proximity-dependent biotin identification', - 1315: 'complex recommended name', 1316: 'complex systematic name', 1317: 'eukaryotic linear motif resource', 
- 1318: 'sulfate donor', 1319: 'sulfate acceptor', 1320: 'membrane yeast two hybrid', - 1321: 'ire1 reconstruction', - 1322: 'atto 465', 1323: 'tag visualisation by alkaline phosphatase activity', 1324: 'conditioned medium', - 1325: 'sulfurtransferase assay', 1326: 'CLONE OF phosphotransfer reaction', 1327: 'sulfurtransfer reaction', - 1328: 'coumarin label', 1329: 'cpm', 1330: 'dnp', 1331: 'evidence ontology', 1332: 'bhf-ucl', 1333: 'rogid', - 1334: 'rigid', 1335: 'hpidb', 1336: 'experiment database', 1337: 'efo', 1338: 'eef tag', - 1339: 'supercharged green fluorescent protein', 1340: 'human orfeome collection', 1341: 'set member', - 1342: 'qcmd', - 1343: 'enzyme regulator', 1344: 'erythrosin iodoacetamide label', 1345: 'rho tag', 1346: 'bmrb', - 1347: 'protein ontology', 1348: 'chembl target', 1349: 'chembl', 1350: 'orphanet', 1351: 'inferred-from', - 1352: 'uracil interference assay', 1353: 'au5 tag', 1354: 'lipase assay', 1355: 'lipid cleavage', - 1356: 'validated two hybrid', 1357: 'RNAcentral', 2002: 'drugbank', 2003: 'commercial name', - 2004: 'drug brand name', - 2005: 'drug mixture brand name', 2006: 'biotech product preparation', 2007: 'iupac name', - 2008: 'chemical formula', - 2009: 'chemical structure', 2010: 'standard inchi', 2011: 'cas registry number', 2012: 'kegg compound', - 2013: 'pubchem', 2015: 'pharmgkb', 2016: 'bind smid', 2017: 'heterogen', - 2020: 'canadian drug identification number', - 2021: 'rxlist link', 2023: 'material safety data sheet', 2024: 'patent number', 2025: 'molecular weight', - 2026: 'melting point', 2027: 'water solubility', 2029: 'logp', 2030: 'isoelectric point', - 2033: 'hydrophobicity', - 2036: 'boiling point', 2039: 'smiles string', 2040: 'drug type', 2041: 'drug category', - 2042: 'disease indication', - 2043: 'pharmacology', 2044: 'mechanism of action', 2045: 'drug absorption', 2046: 'lethal dose 50', - 2047: 'percentage of plasma protein binding', 2048: 'drug biotransformation', 2049: 'elimination half life', - 
2050: 'dosage form', 2051: 'patient information', 2053: 'contraindications', - 2054: 'bioactive entity reference', - 2055: 'chemical stability', 2064: 'solubility', 2084: 'organisms affected', - 2086: 'physicochemical attribute name', - 2089: 'bioactive entity attribute name', 2091: 'structure representation attribute name', - 2097: 'anti-convulsant', - 2098: 'anti-bacterial', 2099: 'fda approved drug', 2100: 'experimental drug', 2101: 'biotech drug', - 2102: 'nutraceutical drug', 2105: 'pka', 2106: 'degree of ionisation ph 7.4', 2107: 'logd', - 2108: 'solubility ph 7.4', - 2109: 'solubility in dmso', 2111: 'diffusion coefficient', 2112: 'chemical stability at pH 2', - 2113: 'dissolution profile', 2115: 'pharmacokinetics attribute name', 2116: 'cell permeability', - 2118: 'volume of distribution', 2120: 'tissue distribution', 2121: 'transporter binding', 2122: 'clearance', - 2123: 'renal clearance', 2124: 'total clearance', 2125: 'maximum absorbable dose', - 2126: 'paracellular absorption', - 2127: 'tmax/cmax', 2128: 'ABCB1 transporter substrate', 2129: 'bile transporter substrate', - 2130: 'cyp-450 inhibition', - 2131: 'metabolite identification', 2132: 'gsh adducts', - 2133: 'neutralization by glucuronidation or sulfatation', - 2135: 'toxicity attribute name', 2136: 'herg binding', 2137: 'genotoxicity', 2138: 'mutagenicity', - 2139: 'carcinogenicity', 2140: 'chromosome damage', 2141: 'hepatotoxicity', 2142: 'phospholipidosis', - 2145: 'solubility ph 6.5', 2146: 'solubility ph 2.0', 2147: 'chemical stability at pH 7.4', - 2148: 'investigational drug', 2149: 'withdrawn drug', 2150: 'illicit drug', 2151: 'other drug interaction', - 2152: 'food interaction', 2153: 'pdr health', 2154: 'wikipedia', 2155: 'average molecular weight', - 2156: 'monoisotopic molecular weight', 2157: 'experimental water solubility', - 2158: 'predicted water solubility', - 2160: 'logs', 2161: 'experimental logs', 2162: 'experimental CaCO2 permeability', 2163: 'by homology', - 2164: 'mind', - 
2165: 'bar', 2166: 'ai', 2167: 'kinetic exclusion assay', 2168: 'conditional site labelling', - 2169: 'luminiscence technology', 2170: 'bimolecular luminiscence complementation', - 2171: 'complemented donor-acceptor resonance energy transfer', 2172: 'aspgd', 2173: 'cgd', 2174: 'ecoliwiki', - 2175: 'genedb', 2176: 'gramene', 2177: 'pombase', 2178: 'agi_locuscode', 2179: 'subset', 2180: 'agbase', - 2181: 'cacao', - 2182: 'dflat', 2183: 'go_central', 2184: 'mtbbase', 2185: 'parkinsonsuk-ucl', 2186: 'alut', 2187: 'ri', - 2188: 'par-clip', 2189: 'avexis', 2190: 'long non-coding ribonucleic acid', 2191: 'clip', 2192: 'clip-seq', - 2193: 'iclip', 2194: 'crac', 2195: 'clash', 2196: 'quartz crystal microbalance', - 2197: 'probe interaction assay', - 2198: 'labelling assay', 2199: 'specific site-labelling technology', 2200: 'primesdb', - 2201: 'DNA chemical modification', 2202: 'RNA chemical modification', 2203: 'primer extension assay', - 2204: 'micro rna', 2205: 'pir', 2206: 'observed nucleic acid chemical modification', - 2207: 'resulting nucleic acid chemical modification', 2208: 'prerequisite-nucleic acid chemical modification', - 2209: 'nucleic acid chemical modification decreasing an interaction', - 2210: 'nucleic acid chemical modification disrupting an interaction', - 2211: 'nucleic acid chemical modification increasing an interaction', 2212: 'proteomexchange', - 2213: 'super-resolution microscopy', 2214: 'signor', 2215: 'barcode fusion genetics two hybrid', - 2216: 'deampylation assay', 2217: 'luciferase-c', 2218: 'luciferase-n', - 2219: 'gaussia luciferase protein tag', - 2220: 'gaussia-c', 2221: 'gaussia-n', 2222: 'inference by socio-affinity scoring', - 2223: 'inference by quantitative co-purification', - 2224: 'chemical rna modification plus base pairing prediction', - 2225: 'zinc', 2226: 'mutation with no effect', 2227: 'mutation causing an interaction', 2228: 'ceitec', - 2229: 'nucleicacid-gene', 2230: 'nucleicacid-nucleicacid', 2231: 'coexpression', - 2232: 
'molecular association', - 2233: 'causal interaction', 2234: 'causal statement', 2235: 'up-regulates', 2236: 'up-regulates activity', - 2237: 'up-regulates quantity', 2238: 'up-regulates quantity by expression', - 2239: 'up-regulates quantity by stabilization', 2240: 'down-regulates', 2241: 'down-regulates activity', - 2242: 'down-regulates quantity', 2243: 'down-regulates quantity by repression', - 2244: 'down-regulates quantity by destabilization', 2245: 'causal regulatory mechanism', - 2246: 'indirect causal regulation', 2247: 'transcriptional regulation', 2248: 'translation regulation', - 2249: 'post transcriptional regulation', 2250: 'direct causal regulation', - 2251: 'transcriptional regulation by direct binding of dbTF to DNA regulatory element', - 2252: 'guanine nucleotide exchange factor reaction', 2253: 'gtpase-activating protein reaction', - 2254: 'chemical activation reaction', 2255: 'chemical inhibition reaction', 2256: 'relocalization', - 2257: 'small molecule catalysis reaction', 2258: 'xenobiotic', 2259: 'causal interactor type', - 2260: 'stimulus', - 2261: 'phenotype', 2262: 'causal regulatory modification', 2263: 's-nitrosylation', - 2264: 'tyrosinated residue', - 2265: 'de-acetylated residue', 2266: 'de-phosphorylated residue', 2267: 'de-sumoylated residue', - 2268: 'de-methylated residue', 2269: 'de-ubiquitinylated residue', 2270: 'signalink', 2271: 'edam', - 2272: 'tyrosinylation', 2273: 'tyrosination', 2274: 'regulator', 2275: 'regulator target', - 2276: 'carbohydrate chemical modification', 2277: 'Cr-two hybrid', 2278: 'polymer chain length', - 2279: 'complex portal', 2280: 'deamidation reaction', 2281: 'deamidation assay', 2282: 'complex-primary', - 2283: 'southwestern blotting', 2284: 'complex component', - 2285: 'miRNA interference luciferase reporter assay', - 2286: 'functional association', 2287: 'identification by structure determination', 2288: 'DAP-seq'} +types = { + 0: "molecular interaction", + 1: "interaction detection method", + 
2: "participant identification method", + 3: "feature detection method", + 4: "affinity chromatography technology", + 5: "alanine scanning", + 6: "anti bait coimmunoprecipitation", + 7: "anti tag coimmunoprecipitation", + 8: "array technology", + 9: "bacterial display", + 10: "beta galactosidase complementation", + 11: "beta lactamase complementation", + 12: "bioluminescence resonance energy transfer", + 13: "biophysical", + 14: "adenylate cyclase complementation", + 16: "circular dichroism", + 17: "classical fluorescence spectroscopy", + 18: "two hybrid", + 19: "coimmunoprecipitation", + 20: "transmission electron microscopy", + 21: "colocalization by fluorescent probes cloning", + 22: "colocalization by immunostaining", + 23: "colocalization/visualisation technologies", + 24: "confirmational text mining", + 25: "copurification", + 26: "correlated mutations", + 27: "cosedimentation", + 28: "cosedimentation in solution", + 29: "cosedimentation through density gradient", + 30: "cross-linking study", + 31: "protein cross-linking with a bifunctional reagent", + 32: "de novo protein sequencing by mass spectrometry", + 33: "deletion analysis", + 34: "display technology", + 35: "docking", + 36: "domain fusion", + 37: "domain profile pairs", + 38: "dynamic light scattering", + 39: "edman degradation", + 40: "electron microscopy", + 41: "electron nuclear double resonance", + 42: "electron paramagnetic resonance", + 43: "electron resonance", + 45: "experimental interaction detection", + 46: "experimental knowledge based", + 47: "far western blotting", + 48: "filamentous phage display", + 49: "filter binding", + 50: "flag tag coimmunoprecipitation", + 51: "fluorescence technology", + 52: "fluorescence correlation spectroscopy", + 53: "fluorescence polarization spectroscopy", + 54: "fluorescence-activated cell sorting", + 55: "fluorescent resonance energy transfer", + 56: "full identification by DNA sequencing", + 57: "gene neighbourhood", + 58: "genome based prediction", + 
59: "gst pull down", + 60: "ha tag coimmunoprecipitation", + 61: "his pull down", + 62: "his tag coimmunoprecipitation", + 63: "interaction prediction", + 64: "interologs mapping", + 65: "isothermal titration calorimetry", + 66: "lambda phage display", + 67: "light scattering", + 68: "mass detection of residue modification", + 69: "mass spectrometry studies of complexes", + 70: "mobility shift", + 71: "molecular sieving", + 72: "monoclonal antibody western blot", + 73: "mrna display", + 74: "mutation analysis", + 75: "myc tag coimmunoprecipitation", + 76: "neural network on interface properties", + 77: "nuclear magnetic resonance", + 78: "nucleotide sequence identification", + 79: "other biochemical technologies", + 80: "partial DNA sequence identification by hybridization", + 81: "peptide array", + 82: "peptide massfingerprinting", + 83: "peptide synthesis", + 84: "phage display", + 85: "phylogenetic profile", + 86: "polyclonal antibody western blot", + 87: "predictive text mining", + 88: "primer specific pcr", + 89: "protein array", + 90: "protein complementation assay", + 91: "chromatography technology", + 92: "protein in situ array", + 93: "protein sequence identification", + 94: "protein staining", + 95: "proteinchip(r) on a surface-enhanced laser desorption/ionization", + 96: "pull down", + 97: "reverse ras recruitment system", + 98: "ribosome display", + 99: "scintillation proximity assay", + 100: "sequence based phylogenetic profile", + 101: "sequence based prediction", + 102: "sequence tag identification", + 103: "southern blot", + 104: "static light scattering", + 105: "structure based prediction", + 106: "surface patches", + 107: "surface plasmon resonance", + 108: "t7 phage display", + 109: "tap tag coimmunoprecipitation", + 110: "text mining", + 111: "dihydrofolate reductase reconstruction", + 112: "ubiquitin reconstruction", + 113: "western blot", + 114: "x-ray crystallography", + 115: "yeast display", + 116: "feature type", + 117: "binding-associated 
region", + 118: "mutation", + 119: "mutation decreasing interaction", + 120: "post translation modification", + 121: "acetylated residue", + 122: "n-acetyl-alanine", + 123: "n2-acetyl-arginine", + 124: "n-acetyl-asparagine", + 125: "n-acetyl-aspartic acid", + 126: "n-acetyl-cysteine", + 127: "n-acetyl-glutamine", + 128: "n-acetyl-glutamic acid", + 129: "n-acetylglycine", + 130: "n-acetyl-histidine", + 131: "n-acetyl-isoleucine", + 132: "n-acetyl-leucine", + 133: "n2-acetyl-lysine", + 134: "n6-acetyl-lysine", + 135: "n-acetyl-methionine", + 136: "n-acetyl-phenylalanine", + 137: "n-acetyl-proline", + 138: "n-acetyl-serine", + 139: "n-acetyl-threonine", + 140: "n-acetyl-tryptophan", + 141: "n-acetyl-tyrosine", + 142: "n-acetyl-valine", + 143: "amidated residue", + 145: "arginine amide", + 146: "formylated residue", + 147: "n-formyl-methionine", + 148: "hydroxylated residue", + 150: "lipid modification", + 151: "s-farnesyl-cysteine", + 152: "s-geranylgeranyl-cysteine", + 153: "n-palmitoyl-cysteine", + 154: "s-palmitoyl-cysteine", + 155: "n-myristoyl-glycine", + 156: "n6-myristoyl-lysine", + 157: "methylated residue", + 158: "n-methyl-alanine", + 159: "n,n,n-trimethyl-alanine", + 160: "omega-n,omega-n-dimethyl-arginine", + 161: "beta-methylthioaspartic acid", + 162: "n5-methyl-glutamine", + 163: "glutamic acid 5-methyl ester", + 165: "n6-methyl-lysine", + 166: "n6,n6-dimethyl-lysine", + 167: "n6,n6,n6-trimethyl-lysine", + 168: "n-methyl-methionine", + 169: "n-methyl-phenylalanine", + 170: "phosphorylated residue", + 171: "omega-n-phospho-arginine", + 172: "aspartic 4-phosphoric anhydride", + 173: "s-phospho-cysteine", + 176: "o-phospho-serine", + 177: "o-phospho-threonine", + 178: "o4'-phospho-tyrosine", + 179: "other modification", + 180: "selenocysteine", + 181: "selenomethionine", + 182: "3-oxoalanine", + 184: "glutamyl 5-glycerylphosphorylethanolamine", + 186: "n6-biotinyl-lysine", + 187: "n6-(4-amino-2-hydroxybutyl)-lysine", + 188: "n6-retinal-lysine", + 189: 
"ubiquitinated lysine", + 190: "interaction type", + 191: "aggregation", + 192: "acetylation reaction", + 193: "amidation reaction", + 194: "cleavage reaction", + 195: "covalent binding", + 196: "covalent interaction", + 197: "deacetylation reaction", + 198: "defarnesylation reaction", + 199: "deformylation reaction", + 200: "degeranylation reaction", + 201: "demyristoylation reaction", + 202: "depalmitoylation reaction", + 203: "dephosphorylation reaction", + 204: "deubiquitination reaction", + 205: "disaggregation", + 206: "farnesylation reaction", + 207: "formylation reaction", + 208: "genetic interaction", + 209: "geranylgeranylation reaction", + 210: "hydroxylation reaction", + 211: "lipid addition", + 212: "lipoprotein cleavage reaction", + 213: "methylation reaction", + 214: "myristoylation reaction", + 215: "non covalent interaction", + 216: "palmitoylation reaction", + 217: "phosphorylation reaction", + 218: "physical interaction", + 219: "synthetic lethal", + 220: "ubiquitination reaction", + 221: "expression level", + 222: "physiological level", + 223: "under expressed level", + 225: "chromatin immunoprecipitation array", + 226: "ion exchange chromatography", + 227: "reverse phase chromatography", + 228: "cytoplasmic complementation assay", + 230: "membrane bound complementation assay", + 231: "mammalian protein protein interaction trap", + 232: "transcriptional complementation assay", + 233: "protein dna complex", + 234: "131i radiolabel", + 235: "14c radiolabel", + 236: "32p radiolabel", + 237: "33p radiolabel", + 238: "3h radiolabel", + 239: "biotin tag", + 240: "fusion protein", + 241: "horseradish peroxidase tag", + 242: "gene ontology definition reference", + 243: "isoform parent sequence reference", + 244: "reactome complex", + 245: "reactome protein", + 246: "cabri", + 247: "newt", + 248: "resid", + 249: "huge", + 250: "gene", + 251: "gene product", + 252: "biological feature", + 253: "isotope label", + 254: "genetic interference", + 255: "post 
transcriptional interference", + 256: "rna interference", + 257: "antisense rna", + 258: "inhibitor antibodies", + 259: "perturbagens peptides", + 260: "inhibitor small molecules", + 261: "suppression", + 262: "suppression mutation", + 263: "suppression knockout", + 264: "suppression partial alteration", + 265: "suppression expression alteration", + 266: "suppression overexpression", + 267: "suppression scalable", + 268: "suppression underexpression", + 269: "synthetic phenotype", + 270: "conditional synthetic lethal", + 271: "conditional synthetic lethal temperature-sensitivity", + 273: "synthetic growth effect", + 274: "synthetic growth defect", + 275: "synthetic growth increase", + 276: "blue native page", + 300: "alias type", + 301: "gene name", + 302: "gene name synonym", + 303: "gene ontology synonym", + 304: "isoform synonym", + 305: "ordered locus name", + 306: "open reading frame name", + 307: "delivery method", + 308: "electroporation", + 309: "genomic tagging", + 310: "infection", + 311: "microinjection", + 312: "nucleic acid transfection", + 313: "interactor type", + 314: "complex", + 315: "protein complex", + 316: "ribonucleoprotein complex", + 317: "interaction", + 318: "nucleic acid", + 319: "deoxyribonucleic acid", + 320: "ribonucleic acid", + 321: "catalytic rna", + 322: "guide rna", + 323: "heterogeneous nuclear rna", + 324: "messenger rna", + 325: "transfer rna", + 326: "protein", + 327: "peptide", + 328: "small molecule", + 329: "unknown participant", + 330: "molecular source", + 331: "engineered", + 332: "naturally occurring", + 333: "feature range status", + 334: "c-terminal position", + 335: "certain sequence position", + 336: "greater-than", + 337: "less-than", + 338: "range", + 339: "undetermined sequence position", + 340: "n-terminal position", + 341: "ragged n-terminus", + 342: "sample process", + 343: "cdna library", + 344: "cell lysate", + 345: "author assigned name", + 346: "experimental preparation", + 348: "fixed cell", + 349: 
"living cell", + 350: "purified", + 351: "homogeneous", + 352: "partially purified", + 353: "cross-reference type", + 354: "cellular component", + 355: "molecular function", + 356: "identical object in an external resource", + 357: "method reference", + 358: "primary-reference", + 359: "biological process", + 360: "secondary accession number", + 361: "additional information", + 362: "inference", + 363: "inferred by author", + 364: "inferred by curator", + 365: "enzyme tag", + 366: "alkaline phosphatase tag", + 367: "green fluorescent protein tag", + 368: "yellow fluorescent protein tag", + 369: "lex-a dimerization assay", + 370: "tox-r dimerization assay", + 371: "35s radiolabel", + 372: "subcellular preparation", + 373: "dye label", + 374: "cyanine label", + 375: "cy3 label", + 376: "cy5 label", + 377: "fluorescein isothiocyanate label", + 378: "rare isotope label", + 379: "13c label", + 380: "15n label", + 381: "2h label", + 382: "mutation increasing interaction", + 383: "biopolymer", + 384: "alexa label", + 385: "alexa 350 label", + 386: "alexa 430 label", + 387: "alexa 488 label", + 388: "alexa 532 label", + 389: "alexa 546 label", + 390: "alexa 568 label", + 391: "alexa 594 label", + 396: "predetermined participant", + 397: "two hybrid array", + 398: "two hybrid pooling approach", + 399: "two hybrid fragment pooling approach", + 400: "affinity technology", + 401: "biochemical", + 402: "chromatin immunoprecipitation assay", + 403: "colocalization", + 404: "comigration in non denaturing gel electrophoresis", + 405: "competition binding", + 406: "deacetylase assay", + 407: "direct interaction", + 408: "disulfide bond", + 409: "dna footprinting", + 410: "electron tomography", + 411: "enzyme linked immunosorbent assay", + 412: "electrophoretic mobility supershift assay", + 413: "electrophoretic mobility shift assay", + 414: "enzymatic reaction", + 415: "enzymatic study", + 416: "fluorescence microscopy", + 417: "footprinting", + 418: "genetic", + 419: "gtpase 
assay", + 420: "kinase homogeneous time resolved fluorescence", + 421: "identification by antibody", + 422: "immunostaining", + 423: "in-gel kinase assay", + 424: "protein kinase assay", + 425: "kinase scintillation proximity assay", + 426: "light microscopy", + 427: "Identification by mass spectrometry", + 428: "imaging technique", + 429: "necessary binding region", + 430: "nucleic acid uv cross-linking assay", + 431: "obsolete", + 432: "one hybrid", + 433: "partial identification of protein sequence", + 434: "phosphatase assay", + 435: "protease assay", + 436: "protein footprinting", + 437: "protein three hybrid", + 438: "rna three hybrid", + 439: "random spore analysis", + 440: "saturation binding", + 441: "synthetic genetic analysis", + 442: "sufficient binding region", + 443: "ubiquitin binding", + 444: "database citation", + 445: "literature database", + 446: "pubmed", + 447: "feature database", + 448: "gene ontology", + 449: "interpro", + 450: "cdd", + 451: "pfam", + 452: "pirsf", + 453: "prints", + 454: "prodom", + 455: "prosite", + 456: "scop superfamily", + 457: "smart", + 458: "tigrfams", + 459: "mmdb", + 460: "rcsb pdb", + 461: "interaction database", + 462: "bind", + 463: "biogrid", + 464: "cygd", + 465: "dip", + 466: "ecocyc", + 467: "reactome", + 468: "hprd", + 469: "intact", + 470: "kegg", + 471: "mint", + 472: "pdbe", + 473: "participant database", + 474: "chebi", + 475: "ddbj/embl/genbank", + 476: "ensembl", + 477: "entrez gene/locuslink", + 478: "flybase", + 479: "mgd/mgi", + 480: "omim", + 481: "refseq", + 482: "rfam", + 483: "rgd", + 484: "sgd", + 485: "uniparc", + 486: "uniprot knowledge base", + 487: "wormbase", + 488: "psi-mi", + 489: "source database", + 490: "experiment condition", + 491: "in silico", + 492: "in vitro", + 493: "in vivo", + 494: "in situ", + 495: "experimental role", + 496: "bait", + 497: "neutral component", + 498: "prey", + 499: "unspecified role", + 500: "biological role", + 501: "enzyme", + 502: "enzyme target", + 503: 
"self", + 505: "experimental feature", + 506: "over expressed level", + 507: "tag", + 508: "deacetylase radiometric assay", + 509: "phosphatase homogeneous time resolved fluorescence", + 510: "homogeneous time resolved fluorescence", + 511: "protease homogeneous time resolved fluorescence", + 512: "zymography", + 513: "collagen film assay", + 514: "in gel phosphatase assay", + 515: "methyltransferase assay", + 516: "methyltransferase radiometric assay", + 517: "radiolabel", + 518: "flag tag", + 519: "glutathione s tranferase tag", + 520: "ha tag", + 521: "his tag", + 522: "myc tag", + 523: "t7 tag", + 524: "calmodulin binding peptide plus protein a tag", + 525: "v5 tag", + 526: "n-acetyl-lysine", + 527: "adp ribosylated residue", + 528: "omega-n-(adp-ribosyl)-arginine", + 529: "s-(adp-ribosyl)-cysteine", + 530: "glutamyl-5-poly(adp-ribose)", + 531: "o-(adp-ribosyl)-serine", + 532: "n4-(adp-ribosyl)-asparagine", + 533: "glycosylated residue", + 534: "glycosyl-cysteine", + 535: "glycosyl-serine", + 536: "glycosyl-threonine", + 537: "omega-n-glycosyl-arginine", + 538: "n4-glycosyl-asparagine", + 539: "gpi anchor residue", + 540: "gpi-anchor amidated alanine", + 541: "gpi-anchor amidated asparagine", + 542: "gpi-anchor amidated aspartate", + 543: "gpi-anchor amidated cysteine", + 544: "gpi-anchor amidated glycine", + 545: "gpi-anchor amidated serine", + 546: "gpi-anchor amidated threonine", + 547: "s-prenyl-cysteine", + 548: "methylated-lysine", + 549: "alkylated cysteine", + 550: "gamma-carboxyglutamic acid", + 551: "nitro-tyrosine", + 552: "s-nitrosyl-cysteine", + 553: "o4'-sulfo-tyrosine", + 554: "sumoylated lysine", + 555: "phospho-histidine", + 556: "transglutamination reaction", + 557: "adp ribosylation reaction", + 558: "deglycosylation reaction", + 559: "glycosylation reaction", + 560: "myristoylated residue", + 561: "palmitoylated residue", + 562: "methylated alanine", + 563: "methylated arginine", + 564: "omega-n-methyl-arginine", + 565: "neddylated lysine", 
+ 566: "sumoylation reaction", + 567: "neddylation reaction", + 568: "desumoylation reaction", + 569: "deneddylation reaction", + 570: "protein cleavage", + 571: "mrna cleavage", + 572: "dna cleavage", + 573: "mutation disrupting interaction", + 574: "digital object identifier", + 575: "alliance for cellular signaling", + 576: "structural proximity", + 577: "feature prediction from structure", + 578: "maltose binding protein tag", + 579: "electron donor", + 580: "electron acceptor", + 581: "suppressor gene", + 582: "suppressed gene", + 583: "fluorescence donor", + 584: "fluorescence acceptor", + 585: "intenz", + 586: "inhibitor", + 587: "inhibited", + 588: "three hybrid", + 589: "in vitro translated protein", + 590: "attribute name", + 591: "experiment description", + 592: "ipfam", + 593: "translocation", + 594: "translocation start", + 595: "translocation end", + 596: "experimental form description", + 597: "feature description", + 598: "feature constraint", + 599: "figure legend", + 600: "conditional synthetic lethal nutrition-sensitivity", + 601: "sequence ontology", + 602: "chemical footprinting", + 603: "dimethylsulphate footprinting", + 604: "potassium permanganate footprinting", + 605: "enzymatic footprinting", + 606: "DNase I footprinting", + 607: "small nuclear rna", + 608: "ribosomal rna", + 609: "small nucleolar rna", + 610: "small interfering rna", + 611: "signal recognition particle rna", + 612: "comment", + 613: "function", + 614: "url", + 615: "search-url", + 616: "example", + 617: "disease", + 618: "caution", + 619: "pathway", + 620: "search-url-ascii", + 621: "author-confidence", + 622: "confidence-mapping", + 623: "inhibition", + 624: "stimulant", + 625: "agonist", + 626: "antagonist", + 627: "experiment modification", + 628: "validation regular expression", + 629: "complex-properties", + 630: "3d-structure", + 631: "3d-r-factors", + 632: "3d-resolution", + 633: "data-processing", + 634: "contact-email", + 635: "contact-comment", + 636: 
"author-list", + 637: "isoform-comment", + 638: "prerequisite-ptm", + 639: "resulting-ptm", + 640: "parameter type", + 641: "ic50", + 642: "ec50", + 643: "ki", + 644: "km", + 645: "kcat", + 646: "Kd", + 647: "parameter unit", + 648: "molar", + 649: "second", + 650: "millimolar", + 651: "micromolar", + 652: "nanomolar", + 653: "picomolar", + 654: "fentomolar", + 655: "lambda repressor two hybrid", + 656: "identified peptide", + 657: "systematic evolution of ligands by exponential enrichment", + 658: "multidimensional protein identification technology", + 659: "experimental feature detection", + 660: "feature prediction", + 661: "experimental participant identification", + 662: "imex-primary", + 663: "confocal microscopy", + 664: "interaction attribute name", + 665: "experiment attibute name", + 666: "participant attribute name", + 667: "controlled vocabulary attribute name", + 668: "feature attribute name", + 669: "organism attribute name", + 670: "imex", + 671: "antibodies", + 672: "library-used", + 673: "complex synonym", + 674: "peptide parent sequence reference", + 675: "international protein index", + 676: "tandem affinity purification", + 677: "tandem tag", + 678: "antibody array", + 679: "poly adenine", + 680: "single stranded deoxyribonucleic acid", + 681: "double stranded deoxyribonucleic acid", + 682: "cofactor", + 683: "sequence database", + 684: "ancillary", + 685: "source reference", + 686: "unspecified method", + 687: "fluorescent protein tag", + 688: "dna binding domain tag", + 689: "activation domain tag", + 690: "gal4 activation domain", + 691: "vp16 activation domain", + 692: "b42 activation domain", + 693: "gal4 dna binding domain", + 694: "lexa dna binding domain", + 695: "sandwich immunoassay", + 696: "polymerase assay", + 697: "dna directed dna polymerase assay", + 698: "dna directed rna polymerase assay", + 699: "rna directed dna polymerase assay", + 700: "rna directed rna polymerase assay", + 701: "dna strand elongation", + 702: "panther", + 
703: "gene3d", + 704: "nucleic acid delivery", + 705: "anti tag western blot", + 706: "nucleic acid transformation", + 707: "anti tag immunostaining", + 708: "monoclonal antibody immunostaining", + 709: "polyclonal antibody immunostaining", + 710: "nucleic acid transformation by treatment with divalent cation", + 711: "nucleic acid electroporation", + 712: "nucleic acid microinjection", + 713: "nucleic acid passive uptake", + 714: "nucleic acid transduction", + 715: "nucleic acid conjugation", + 716: "passive uptake", + 717: "nucleic acid transfection with liposome", + 718: "nucleic acid transfection by treatment", + 719: "calcium phosphate nucleic acid transfection", + 720: "nucleic acid delivery by infection", + 721: "protein delivery", + 722: "protein electroporation", + 723: "protein microinjection", + 724: "protein delivery by cationic lipid treatment", + 725: "protein delivery by infection", + 726: "reverse two hybrid", + 727: "lexa b52 complementation", + 728: "gal4 vp16 complementation", + 729: "luminescence based mammalian interactome mapping", + 730: "pubchem", + 731: "3d repertoire", + 732: "red fluorescent protein tag", + 733: "cyan fluorescent protein tag", + 734: "enhanced green fluorescent protein tag", + 735: "transactivating tag", + 736: "protein passive uptake", + 737: "peptide sequence database", + 738: "pride", + 739: "penetrating tag", + 740: "cell penetrating peptide tag", + 741: "peptide atlas", + 742: "gpm", + 787: "genetic experimental form", + 788: "knock out", + 789: "knock down", + 790: "hypermorph", + 791: "hypomorph", + 792: "antimorph", + 793: "amorph", + 794: "synthetic", + 795: "asynthetic", + 796: "suppression", + 797: "epistasis", + 798: "conditional genetic interaction defined by inequality", + 799: "additive genetic interaction defined by inequality", + 800: "single nonmonotonic genetic interaction defined by inequality", + 801: "double nonmonotonic genetic interaction defined by inequality", + 802: "enhancement interaction", + 
803: "expression level alteration", + 804: "mutated gene", + 805: "wwpdb", + 806: "pdbj", + 807: "comigration in gel electrophoresis", + 808: "comigration in sds page", + 809: "bimolecular fluorescence complementation", + 810: "substitution analysis", + 811: "insertion analysis", + 812: "calmodulin binding protein tag", + 813: "proximity ligation assay", + 814: "protease accessibility laddering", + 815: "confirmation by molecular weight", + 816: "molecular weight estimation by staining", + 817: "molecular weight estimation by silver staining", + 818: "molecular weight estimation by coomasie staining", + 819: "molecular weight estimation by bromide staining", + 820: "molecular weight estimation by sybr staining", + 821: "molecular weight estimation by autoradiography", + 822: "molecular weight estimation by hoechst staining", + 823: "predetermined feature", + 824: "x-ray powder diffraction", + 825: "x-ray fiber diffraction", + 826: "x ray scattering", + 827: "x-ray tomography", + 828: "polyprotein fragment", + 829: "multiple parent reference", + 830: "tissue list", + 831: "cell ontology", + 832: "half cystine", + 833: "autoradiography", + 834: "ka", + 835: "koff", + 836: "temperature of interaction", + 837: "pH of interaction", + 838: "kelvin", + 839: "per mole per second", + 840: "stimulator", + 841: "phosphotransferase assay", + 842: "phosphate donor", + 843: "phosphate acceptor", + 844: "phosphotransfer reaction", + 845: "spin label", + 846: "r1 spin label", + 847: "dansyl label", + 848: "125i radiolabel", + 849: "ncbi taxonomy", + 850: "encode", + 851: "protein genbank identifier", + 852: "nucleotide genbank identifier", + 853: "dna overhang", + 854: "3 prime overhang", + 855: "5 prime overhang", + 856: "fluorophore", + 857: "fluorescent dye label", + 858: "immunodepleted coimmunoprecipitation", + 859: "intermolecular force", + 860: "genbank indentifier", + 861: "protein a tag", + 862: "zz tag", + 863: "thiol reactive lanthanide label", + 864: "brenda", + 865: 
"fluorescence acceptor donor pair", + 866: "tag visualisation", + 867: "tag visualisation by fluorescence", + 868: "author identifier", + 869: "originally assigned identifier", + 870: "demethylase assay", + 871: "demethylation reaction", + 872: "atomic force microscopy", + 873: "curation request", + 875: "dataset", + 878: "author submitted", + 879: "nucleoside triphosphatase assay", + 880: "atpase assay", + 881: "nucleoside triphosphatase reaction", + 882: "atpase reaction", + 883: "gtpase reaction", + 884: "vsv tag", + 885: "journal", + 886: "publication year", + 887: "histone acetylase assay", + 888: "small angle neutron scattering", + 889: "acetylase assay", + 890: "qdot", + 891: "neutron fiber diffraction", + 892: "solid phase assay", + 893: "neutron diffraction", + 894: "electron diffraction", + 895: "protein kinase A complementation", + 896: "renilla luciferase protein tag", + 897: "protein modification ontology", + 898: "putative self", + 899: "p3 filamentous phage display", + 900: "p8 filamentous phage display", + 901: "isotope label footprinting", + 902: "rna cleavage", + 903: "mpidb", + 904: "polysaccharide", + 905: "amplified luminescent proximity homogeneous assay", + 906: "au1 tag", + 907: "conformational status", + 908: "denatured", + 909: "native", + 910: "nucleic acid cleavage", + 911: "cross linker", + 912: "spdp cross linker", + 913: "lc-spdp cross linker", + 914: "association", + 915: "physical association", + 916: "lexa vp16 complementation", + 917: "matrixdb", + 918: "donor", + 919: "acceptor", + 920: "ribonuclease assay", + 921: "surface plasmon resonance array", + 922: "imex evidence", + 923: "irefindex", + 924: "camjedb", + 925: "observed-ptm", + 926: "fiash label", + 927: "iaedans label", + 928: "filter trap assay", + 929: "northern blot", + 930: "epistatis", + 931: "genetic interaction defined by inequality", + 932: "noninteractive", + 933: "negative genetic interaction", + 934: "neutral genetic interaction", + 935: "positive genetic 
interaction", + 936: "emdb", + 937: "glu tag", + 938: "rheology measurement", + 939: "fluorescein label", + 940: "fluorescein-5-maleimide label", + 941: "competitor", + 942: "uniprot taxonomy", + 943: "detection by mass spectrometry", + 944: "mass spectrometry study of hydrogen/deuterium exchange", + 945: "oxidoreductase activity electron transfer reaction", + 946: "miniaturized immunoprecipitation", + 947: "bead aggregation assay", + 948: "kinetic conditions", + 949: "gdp/gtp exchange assay", + 950: "trapping mutant", + 951: "chain parent sequence reference", + 952: "imex secondary", + 953: "polymerization", + 954: "curation quality", + 955: "curation depth", + 956: "curation coverage", + 957: "full coverage", + 958: "partial coverage", + 959: "imex curation", + 960: "mimix curation", + 961: "rapid curation", + 962: "strep ii tag", + 963: "interactome parallel affinity capture", + 964: "infrared spectroscopy", + 965: "2d-infrared spectrometry", + 966: "ultraviolet-visible spectroscopy", + 967: "chembl compound", + 968: "biosensor", + 969: "bio-layer interferometry", + 970: "inchi key", + 971: "phosphopantetheinylation", + 972: "phosphopantetheinylase assay", + 973: "imex source", + 974: "innatedb", + 975: "fc-igg tag", + 976: "total internal reflection fluorescence spectroscopy", + 977: "no-imex-export", + 978: "author-name", + 979: "oxidoreductase assay", + 980: "tag visualisation by enzyme assay", + 981: "tag visualisation by peroxidase activity", + 982: "electrophoretic mobility-based method", + 983: "gemma", + 984: "deamination assay", + 985: "deamination reaction", + 986: "nucleic acid strand elongation reaction", + 987: "rna strand elongation", + 988: "strep tag", + 989: "amidase assay", + 990: "cleavage assay", + 991: "lipoprotein cleavage assay", + 992: "defarnesylase assay", + 993: "degeranylase assay", + 994: "demyristoylase assay", + 995: "depalmitoylase assay", + 996: "deformylase assay", + 997: "ubiquitinase assay", + 998: "deubiquitinase assay", + 
999: "formylase assay", + 1000: "hydroxylase assay", + 1001: "lipidase assay", + 1002: "myristoylase assay", + 1003: "geranylgeranylase assay", + 1004: "palmitoylase assay", + 1005: "adp ribosylase assay", + 1006: "deglycosylase assay", + 1007: "glycosylase assay", + 1008: "sumoylase assay", + 1009: "desumoylase assay", + 1010: "neddylase assay", + 1011: "deneddylase assay", + 1012: "sbp", + 1013: "ensemblgenomes", + 1014: "string", + 1015: "dictybase", + 1016: "fluorescence recovery after photobleaching", + 1017: "rna immunoprecipitation", + 1018: "deltamass", + 1019: "protein phosphatase assay", + 1020: "hilyte fluor 488", + 1021: "qx 520", + 1022: "field flow fractionation", + 1023: "luminogreen", + 1024: "scanning electron microscopy", + 1025: "unimod", + 1026: "diphtamidase assay", + 1027: "diphtamidation reaction", + 1028: "modified chromatin immunoprecipitation", + 1029: "proteomics of isolated chromatin segments", + 1030: "excimer fluorescence", + 1031: "protein folding/unfolding", + 1032: "atto 488", + 1033: "atto 550", + 1034: "nuclease assay", + 1035: "deoxyribonuclease assay", + 1036: "nucleotide exchange assay", + 1037: "Split renilla luciferase complementation", + 1038: "silicon nanowire field-effect transistor", + 1039: "c-terminal range", + 1040: "n-terminal range", + 1041: "synonym", + 1042: "pubmed central", + 1043: "flannotator", + 1044: "rice genome annotation project", + 1045: "curation content", + 1046: "interacting molecules", + 1047: "protein-protein", + 1048: "smallmolecule-protein", + 1049: "nucleicacid-protein", + 1050: "interaction representation", + 1051: "evidence", + 1052: "clustered", + 1053: "data source", + 1054: "experimentally-observed", + 1055: "internally-curated", + 1056: "text-mining", + 1057: "predicted", + 1058: "imported", + 1059: "complex expansion", + 1060: "spoke expansion", + 1061: "matrix expansion", + 1062: "bipartite expansion", + 1063: "consensuspathdb", + 1064: "interaction confidence", + 1065: "replication-based 
confidence", + 1066: "structure-based confidence", + 1067: "function-based confidence", + 1068: "location-based confidence", + 1069: "network-based confidence", + 1070: "standard-based confidence", + 1071: "literature-based confidence", + 1072: "method-based confidence", + 1073: "statistical-based confidence", + 1074: "rgs-his tag", + 1075: "beilstein", + 1076: "einecs", + 1077: "merck index", + 1078: "plantgdb", + 1079: "ratmap", + 1080: "tair", + 1081: "tigr/jcvi", + 1082: "zfin", + 1083: "cog", + 1084: "photon donor", + 1085: "photon acceptor", + 1086: "equilibrium dialysis", + 1087: "monoclonal antibody blockade", + 1088: "phenotype-based detection assay", + 1089: "nuclear translocation assay", + 1090: "bimane label", + 1091: "publication title", + 1092: "atto label", + 1093: "bibliographic attribute name", + 1094: "genome databases", + 1095: "hgnc", + 1096: "protein sequence databases", + 1097: "uniprot", + 1098: "uniprot/swiss-prot", + 1099: "uniprot/trembl", + 1100: "bioactive entity", + 1101: "standard inchi key", + 1102: "mapped-identity", + 1103: "solution state nmr", + 1104: "solid state nmr", + 1105: "biocyc", + 1106: "pathways database", + 1107: "pid", + 1108: "biocarta", + 1109: "gene database", + 1110: "predicted interaction", + 1111: "two hybrid bait or prey pooling approach", + 1112: "two hybrid prey pooling approach", + 1113: "two hybrid bait and prey pooling approach", + 1114: "virhostnet", + 1115: "spike", + 1116: "genemania", + 1117: "topfind", + 1118: "enhanced yellow fluorescent protein tag", + 1119: "nYFP", + 1120: "cYFP", + 1121: "ceYFP", + 1122: "neYFP", + 1123: "bindingdb", + 1124: "pathwaycommons", + 1125: "direct binding region", + 1126: "self interaction", + 1127: "putative self interaction", + 1128: "mutation disrupting interaction strength", + 1129: "mutation disrupting interaction rate", + 1130: "mutation decreasing interaction rate", + 1131: "mutation increasing interaction rate", + 1132: "mutation increasing interaction strength", 
+ 1133: "mutation decreasing interaction strength", + 1134: "mcherry fluorescent protein tag", + 1135: "venus fluorescent protein tag", + 1136: "kusabira-green protein tag", + 1137: "carboxylation assay", + 1138: "decarboxylation assay", + 1139: "carboxylation reaction", + 1140: "decarboxylation reaction", + 1141: "s tag", + 1142: "aminoacylation assay", + 1143: "aminoacylation reaction", + 1144: "protein a tag visualisation", + 1145: "phospholipase assay", + 1146: "phospholipase reaction", + 1147: "ampylation assay", + 1148: "ampylation reaction", + 1149: "cooperative interaction", + 1150: "affected interaction", + 1151: "participant-ref", + 1152: "cooperative effect value", + 1153: "cooperative effect outcome", + 1154: "positive cooperative effect", + 1155: "negative cooperative effect", + 1156: "cooperative mechanism", + 1157: "allostery", + 1158: "pre-assembly", + 1159: "allosteric molecule", + 1160: "allosteric effector", + 1161: "allosteric response", + 1162: "allosteric k-type response", + 1163: "allosteric v-type response", + 1164: "allosteric mechanism", + 1165: "allosteric change in structure", + 1166: "allosteric change in dynamics", + 1167: "allostery type", + 1168: "heterotropic allostery", + 1169: "homotropic allostery", + 1170: "pre-assembly response", + 1171: "composite binding site formation", + 1172: "altered physicochemical compatibility", + 1173: "binding site hiding", + 1174: "configurational pre-organization", + 1175: "allosteric post-translational modification", + 1176: "sequence based prediction of gene regulatory region binding sites", + 1177: "phylogenetic profiling of predicted gene regulatory region binding sites", + 1178: "sequence based prediction of binding of transcription factor to transcribed gene regulatory elements", + 1179: "partial nucleotide sequence identification", + 1180: "partial DNA sequence identification", + 1181: "paired end tags sequence identification", + 1182: "full identification by RNA sequencing", + 1183: 
"nuclease footprinting", + 1184: "dna adenine methyltransferase identification", + 1185: "tag visualisation by dna adenine methyltransferase", + 1186: "dna methyltransferase tag", + 1187: "damip", + 1188: "tag visualisation by mutated dna adenine methyltransferase", + 1189: "methylation interference assay", + 1190: "hydroxy radical footprinting", + 1191: "ultraviolet (uv) footprinting", + 1192: "antisense oligonucleotides", + 1193: "partial RNA sequence identification", + 1194: "reverse transcription pcr", + 1195: "quantitative pcr", + 1196: "quantitative reverse transcription pcr", + 1197: "radioimmunoassay", + 1198: "immunohistochemistry", + 1199: "anti-tag immunohistochemistry", + 1200: "immunocytochemistry", + 1201: "anti-tag immunocytochemistry", + 1202: "one-strep-tag", + 1203: "split luciferase complementation", + 1204: "split firefly luciferase complementation", + 1205: "luciferase tag", + 1206: "renilla-n", + 1207: "renilla-c", + 1208: "firefly luciferase protein tag", + 1209: "firefly-c", + 1210: "firefly-n", + 1211: "liposome binding assay", + 1212: "checksum", + 1213: "n-venus", + 1214: "c-venus", + 1215: "bifc tag", + 1216: "cGFP", + 1217: "nGFP", + 1218: "chromosome conformation capture assay", + 1219: "enzyme-mediated activation of radical sources", + 1220: "tag visualisation by luciferase assay", + 1221: "author-based confidence", + 1222: "mbinfo", + 1223: "ptm decreasing an interaction", + 1224: "ptm increasing an interaction", + 1225: "ptm disrupting an interaction", + 1226: "ampylation assay", + 1227: "lap tag", + 1228: "pyo tag", + 1229: "uridylation assay", + 1230: "uridylation reaction", + 1231: "aminomethylcoumarin label", + 1232: "aggregation assay", + 1233: "resulting-cleavage", + 1234: "silac", + 1235: "thermal shift binding", + 1236: "proline isomerase assay", + 1237: "proline isomerization reaction", + 1238: "mass spectrometry studies of subunit exchange", + 1239: "amino-acid variant", + 1240: "disease causing amino-acid variant", + 
1241: "variant", + 1242: "fc-igg1", + 1243: "fc-igg2", + 1244: "mkate2", + 1245: "mkate", + 1246: "ion mobility mass spectrometry of complexes", + 1247: "microscale thermophoresis", + 1248: "bodipy label", + 1249: "isomerase assay", + 1250: "isomerase reaction", + 1251: "methylmalonyl-CoA isomerase reaction", + 1252: "methylmalonyl-CoA isomerase assay", + 1253: "atto 532", + 1254: "atto 647", + 1255: "stilbene label", + 1256: "luminscent dye label", + 1257: "rhodamine label", + 1258: "tetramethyl rhodamine label", + 1259: "acrylodan label", + 1260: "pyrene label", + 1261: "oregon green label", + 1262: "iid", + 1263: "molecular connections", + 1264: "ntnu", + 1265: "ubiquitin reconstruction tag", + 1266: "cub", + 1267: "nub", + 1268: "nubg", + 1269: "duplicated protein", + 1270: "xpress tag", + 1271: "enhancement", + 1272: "positive epistasis", + 1273: "maximal epistasis", + 1274: "minimal epistasis", + 1275: "neutral epistasis", + 1276: "opposing epistasis", + 1277: "qualitative epistasis", + 1278: "mutual enhancement", + 1279: "unilateral enhancement", + 1280: "mutual suppression", + 1281: "mutual suppression (complete)", + 1282: "mutual suppression (partial)", + 1283: "suppression-enhancement", + 1284: "quantitative epistasis", + 1285: "opposing epistasis", + 1286: "over-suppression", + 1287: "mutual over-suppression", + 1288: "over-suppression-enhancement", + 1289: "phenotype bias", + 1290: "suppression (complete)", + 1291: "suppression (partial)", + 1292: "unilateral suppression", + 1293: "unilateral suppression (complete)", + 1294: "unilateral suppression (partial)", + 1295: "unilateral over-suppression", + 1296: "amino acid analysis", + 1297: "phosphoamino acid analysis", + 1298: "complex type", + 1299: "complex composition", + 1300: "obligate complex", + 1301: "non-obligate complex", + 1302: "stable complex", + 1303: "transient complex", + 1304: "molecule set", + 1305: "candidate set", + 1306: "open set", + 1307: "defined set", + 1308: "resulting 
sequence", + 1309: "de-ADP-ribosylation assay", + 1310: "de-ADP-ribosylation reaction", + 1311: "differential scanning calorimetry", + 1312: "aut-page", + 1313: "proximity labelling technology", + 1314: "proximity-dependent biotin identification", + 1315: "complex recommended name", + 1316: "complex systematic name", + 1317: "eukaryotic linear motif resource", + 1318: "sulfate donor", + 1319: "sulfate acceptor", + 1320: "membrane yeast two hybrid", + 1321: "ire1 reconstruction", + 1322: "atto 465", + 1323: "tag visualisation by alkaline phosphatase activity", + 1324: "conditioned medium", + 1325: "sulfurtransferase assay", + 1326: "CLONE OF phosphotransfer reaction", + 1327: "sulfurtransfer reaction", + 1328: "coumarin label", + 1329: "cpm", + 1330: "dnp", + 1331: "evidence ontology", + 1332: "bhf-ucl", + 1333: "rogid", + 1334: "rigid", + 1335: "hpidb", + 1336: "experiment database", + 1337: "efo", + 1338: "eef tag", + 1339: "supercharged green fluorescent protein", + 1340: "human orfeome collection", + 1341: "set member", + 1342: "qcmd", + 1343: "enzyme regulator", + 1344: "erythrosin iodoacetamide label", + 1345: "rho tag", + 1346: "bmrb", + 1347: "protein ontology", + 1348: "chembl target", + 1349: "chembl", + 1350: "orphanet", + 1351: "inferred-from", + 1352: "uracil interference assay", + 1353: "au5 tag", + 1354: "lipase assay", + 1355: "lipid cleavage", + 1356: "validated two hybrid", + 1357: "RNAcentral", + 2002: "drugbank", + 2003: "commercial name", + 2004: "drug brand name", + 2005: "drug mixture brand name", + 2006: "biotech product preparation", + 2007: "iupac name", + 2008: "chemical formula", + 2009: "chemical structure", + 2010: "standard inchi", + 2011: "cas registry number", + 2012: "kegg compound", + 2013: "pubchem", + 2015: "pharmgkb", + 2016: "bind smid", + 2017: "heterogen", + 2020: "canadian drug identification number", + 2021: "rxlist link", + 2023: "material safety data sheet", + 2024: "patent number", + 2025: "molecular weight", + 2026: 
"melting point", + 2027: "water solubility", + 2029: "logp", + 2030: "isoelectric point", + 2033: "hydrophobicity", + 2036: "boiling point", + 2039: "smiles string", + 2040: "drug type", + 2041: "drug category", + 2042: "disease indication", + 2043: "pharmacology", + 2044: "mechanism of action", + 2045: "drug absorption", + 2046: "lethal dose 50", + 2047: "percentage of plasma protein binding", + 2048: "drug biotransformation", + 2049: "elimination half life", + 2050: "dosage form", + 2051: "patient information", + 2053: "contraindications", + 2054: "bioactive entity reference", + 2055: "chemical stability", + 2064: "solubility", + 2084: "organisms affected", + 2086: "physicochemical attribute name", + 2089: "bioactive entity attribute name", + 2091: "structure representation attribute name", + 2097: "anti-convulsant", + 2098: "anti-bacterial", + 2099: "fda approved drug", + 2100: "experimental drug", + 2101: "biotech drug", + 2102: "nutraceutical drug", + 2105: "pka", + 2106: "degree of ionisation ph 7.4", + 2107: "logd", + 2108: "solubility ph 7.4", + 2109: "solubility in dmso", + 2111: "diffusion coefficient", + 2112: "chemical stability at pH 2", + 2113: "dissolution profile", + 2115: "pharmacokinetics attribute name", + 2116: "cell permeability", + 2118: "volume of distribution", + 2120: "tissue distribution", + 2121: "transporter binding", + 2122: "clearance", + 2123: "renal clearance", + 2124: "total clearance", + 2125: "maximum absorbable dose", + 2126: "paracellular absorption", + 2127: "tmax/cmax", + 2128: "ABCB1 transporter substrate", + 2129: "bile transporter substrate", + 2130: "cyp-450 inhibition", + 2131: "metabolite identification", + 2132: "gsh adducts", + 2133: "neutralization by glucuronidation or sulfatation", + 2135: "toxicity attribute name", + 2136: "herg binding", + 2137: "genotoxicity", + 2138: "mutagenicity", + 2139: "carcinogenicity", + 2140: "chromosome damage", + 2141: "hepatotoxicity", + 2142: "phospholipidosis", + 2145: "solubility 
ph 6.5", + 2146: "solubility ph 2.0", + 2147: "chemical stability at pH 7.4", + 2148: "investigational drug", + 2149: "withdrawn drug", + 2150: "illicit drug", + 2151: "other drug interaction", + 2152: "food interaction", + 2153: "pdr health", + 2154: "wikipedia", + 2155: "average molecular weight", + 2156: "monoisotopic molecular weight", + 2157: "experimental water solubility", + 2158: "predicted water solubility", + 2160: "logs", + 2161: "experimental logs", + 2162: "experimental CaCO2 permeability", + 2163: "by homology", + 2164: "mind", + 2165: "bar", + 2166: "ai", + 2167: "kinetic exclusion assay", + 2168: "conditional site labelling", + 2169: "luminiscence technology", + 2170: "bimolecular luminiscence complementation", + 2171: "complemented donor-acceptor resonance energy transfer", + 2172: "aspgd", + 2173: "cgd", + 2174: "ecoliwiki", + 2175: "genedb", + 2176: "gramene", + 2177: "pombase", + 2178: "agi_locuscode", + 2179: "subset", + 2180: "agbase", + 2181: "cacao", + 2182: "dflat", + 2183: "go_central", + 2184: "mtbbase", + 2185: "parkinsonsuk-ucl", + 2186: "alut", + 2187: "ri", + 2188: "par-clip", + 2189: "avexis", + 2190: "long non-coding ribonucleic acid", + 2191: "clip", + 2192: "clip-seq", + 2193: "iclip", + 2194: "crac", + 2195: "clash", + 2196: "quartz crystal microbalance", + 2197: "probe interaction assay", + 2198: "labelling assay", + 2199: "specific site-labelling technology", + 2200: "primesdb", + 2201: "DNA chemical modification", + 2202: "RNA chemical modification", + 2203: "primer extension assay", + 2204: "micro rna", + 2205: "pir", + 2206: "observed nucleic acid chemical modification", + 2207: "resulting nucleic acid chemical modification", + 2208: "prerequisite-nucleic acid chemical modification", + 2209: "nucleic acid chemical modification decreasing an interaction", + 2210: "nucleic acid chemical modification disrupting an interaction", + 2211: "nucleic acid chemical modification increasing an interaction", + 2212: "proteomexchange", + 
2213: "super-resolution microscopy", + 2214: "signor", + 2215: "barcode fusion genetics two hybrid", + 2216: "deampylation assay", + 2217: "luciferase-c", + 2218: "luciferase-n", + 2219: "gaussia luciferase protein tag", + 2220: "gaussia-c", + 2221: "gaussia-n", + 2222: "inference by socio-affinity scoring", + 2223: "inference by quantitative co-purification", + 2224: "chemical rna modification plus base pairing prediction", + 2225: "zinc", + 2226: "mutation with no effect", + 2227: "mutation causing an interaction", + 2228: "ceitec", + 2229: "nucleicacid-gene", + 2230: "nucleicacid-nucleicacid", + 2231: "coexpression", + 2232: "molecular association", + 2233: "causal interaction", + 2234: "causal statement", + 2235: "up-regulates", + 2236: "up-regulates activity", + 2237: "up-regulates quantity", + 2238: "up-regulates quantity by expression", + 2239: "up-regulates quantity by stabilization", + 2240: "down-regulates", + 2241: "down-regulates activity", + 2242: "down-regulates quantity", + 2243: "down-regulates quantity by repression", + 2244: "down-regulates quantity by destabilization", + 2245: "causal regulatory mechanism", + 2246: "indirect causal regulation", + 2247: "transcriptional regulation", + 2248: "translation regulation", + 2249: "post transcriptional regulation", + 2250: "direct causal regulation", + 2251: "transcriptional regulation by direct binding of dbTF to DNA regulatory element", + 2252: "guanine nucleotide exchange factor reaction", + 2253: "gtpase-activating protein reaction", + 2254: "chemical activation reaction", + 2255: "chemical inhibition reaction", + 2256: "relocalization", + 2257: "small molecule catalysis reaction", + 2258: "xenobiotic", + 2259: "causal interactor type", + 2260: "stimulus", + 2261: "phenotype", + 2262: "causal regulatory modification", + 2263: "s-nitrosylation", + 2264: "tyrosinated residue", + 2265: "de-acetylated residue", + 2266: "de-phosphorylated residue", + 2267: "de-sumoylated residue", + 2268: "de-methylated 
residue", + 2269: "de-ubiquitinylated residue", + 2270: "signalink", + 2271: "edam", + 2272: "tyrosinylation", + 2273: "tyrosination", + 2274: "regulator", + 2275: "regulator target", + 2276: "carbohydrate chemical modification", + 2277: "Cr-two hybrid", + 2278: "polymer chain length", + 2279: "complex portal", + 2280: "deamidation reaction", + 2281: "deamidation assay", + 2282: "complex-primary", + 2283: "southwestern blotting", + 2284: "complex component", + 2285: "miRNA interference luciferase reporter assay", + 2286: "functional association", + 2287: "identification by structure determination", + 2288: "DAP-seq", +} diff --git a/ebel/manager/neo4j/n4j_importer.py b/ebel/manager/neo4j/n4j_importer.py index 2fecf57..a875f4b 100644 --- a/ebel/manager/neo4j/n4j_importer.py +++ b/ebel/manager/neo4j/n4j_importer.py @@ -3,6 +3,7 @@ import re import json import logging +import traceback from collections import defaultdict from tqdm import tqdm @@ -109,20 +110,30 @@ def insert_statements_and_sets(self, statements_and_sets: dict) -> int: annotation.pop(anno_keyword, None) elif dtype == "statement" and len(data) >= 1: + try: + _, subj_class, subject_id = self.get_node_id(data[0]['subject']) + if len(data) > 1 and 'object' in data[2]: + # TODO: nested statements are missing + #print(data) + _, obj_class, object_id = self.get_node_id(data[2]['object']) - _, subj_class, subject_id = self.get_node_id(data[0]['subject']) + relation = data[1]['relation'] + neo4j_relation_class = edge_map[relation] - if len(data) > 1 and 'object' in data[2]: - # TODO: nested statements are missing - - _, obj_class, object_id = self.get_node_id(data[2]['object']) - - relation = data[1]['relation'] - neo4j_relation_class = edge_map[relation] - - new_edges += self.insert_bel_edge(annotation, citation, evidence, pmid, neo4j_relation_class, - subject_id, object_id) + new_edges += self.insert_bel_edge(annotation, citation, evidence, pmid, neo4j_relation_class, + subject_id, object_id) + else: + 
print(data[2]) + logger.warning(f"The following couldn't be imported {data}") + print(data) + + except Exception as e: + import traceback + traceback.print_exc() + print(pmid) + print(data) + return new_edges @staticmethod diff --git a/ebel/manager/neo4j/n4j_structure.py b/ebel/manager/neo4j/n4j_structure.py index 33627ac..05e1b9d 100644 --- a/ebel/manager/neo4j/n4j_structure.py +++ b/ebel/manager/neo4j/n4j_structure.py @@ -52,6 +52,8 @@ "has_components": "HAS_COMPONENTS", "has_member": "HAS_MEMBER", "has_members": "HAS_MEMBERS", + "has_modification": "HAS_MODIFICATION", + "includes": "INCLUDES", "increases": "INCREASES", "is_a": "IS_A", "negative_correlation": "NEGATIVE_CORRELATION", diff --git a/ebel/manager/orientdb/biodbs/bel.py b/ebel/manager/orientdb/biodbs/bel.py index d32380a..10c5106 100644 --- a/ebel/manager/orientdb/biodbs/bel.py +++ b/ebel/manager/orientdb/biodbs/bel.py @@ -1,43 +1,46 @@ """The hub for all things BEL.""" -import os import logging - -from tqdm import tqdm +import os from collections import namedtuple -from typing import Iterable, Union, Set, Dict, Optional +from typing import Dict, Iterable, Optional, Set, Union + from pyorientdb.exceptions import PyOrientCommandException +from tqdm import tqdm -from ebel.manager.orientdb.odb_meta import Graph -from ebel.constants import SPECIES_NAMESPACE, RID -from ebel.manager.orientdb.importer import _BelImporter +from ebel.constants import RID, SPECIES_NAMESPACE from ebel.manager.orientdb import odb_meta, odb_structure -from ebel.manager.constants import bel_func_short -from ebel.manager.orientdb.constants import DRUGBANK, EXPRESSION_ATLAS, HGNC, CHEBI, ENSEMBL, GWAS_CATALOG, CLINVAR, \ - UNIPROT, REACTOME, STRINGDB, INTACT, BIOGRID, MIRTARBASE, PATHWAY_COMMONS, DISGENET, KEGG, IUPHAR, NSIDES, \ - CLINICAL_TRIALS, PROTEIN_ATLAS, NCBI - -from ebel.manager.orientdb.biodbs.hgnc import Hgnc -from ebel.manager.orientdb.biodbs.kegg import Kegg -from ebel.manager.orientdb.biodbs.ncbi import Ncbi +from 
ebel.manager.orientdb.biodbs.biogrid import BioGrid from ebel.manager.orientdb.biodbs.chebi import Chebi -from ebel.manager.orientdb.biodbs.intact import IntAct -from ebel.manager.orientdb.biodbs.nsides import Nsides -from ebel.manager.orientdb.biodbs.iuphar import Iuphar +from ebel.manager.orientdb.biodbs.clinical_trials import ClinicalTrials from ebel.manager.orientdb.biodbs.clinvar import ClinVar -from ebel.manager.orientdb.biodbs.uniprot import UniProt -from ebel.manager.orientdb.biodbs.biogrid import BioGrid -from ebel.manager.orientdb.biodbs.ensembl import Ensembl from ebel.manager.orientdb.biodbs.disgenet import DisGeNet -from ebel.manager.orientdb.biodbs.reactome import Reactome -from ebel.manager.orientdb.biodbs.stringdb import StringDb from ebel.manager.orientdb.biodbs.drugbank import DrugBank -from ebel.manager.orientdb.biodbs.mirtarbase import MirTarBase +from ebel.manager.orientdb.biodbs.ensembl import Ensembl +from ebel.manager.orientdb.biodbs.expression_atlas import ExpressionAtlas from ebel.manager.orientdb.biodbs.gwas_catalog import GwasCatalog -from ebel.manager.orientdb.biodbs.protein_atlas import ProteinAtlas -from ebel.manager.orientdb.biodbs.clinical_trials import ClinicalTrials +from ebel.manager.orientdb.biodbs.hgnc import Hgnc +from ebel.manager.orientdb.biodbs.intact import IntAct +from ebel.manager.orientdb.biodbs.iuphar import Iuphar +from ebel.manager.orientdb.biodbs.kegg import Kegg +from ebel.manager.orientdb.biodbs.mirtarbase import MirTarBase +from ebel.manager.orientdb.biodbs.ncbi import Ncbi +from ebel.manager.orientdb.biodbs.nsides import Nsides from ebel.manager.orientdb.biodbs.pathway_commons import PathwayCommons -from ebel.manager.orientdb.biodbs.expression_atlas import ExpressionAtlas - +from ebel.manager.orientdb.biodbs.protein_atlas import ProteinAtlas +from ebel.manager.orientdb.biodbs.reactome import Reactome +from ebel.manager.orientdb.biodbs.stringdb import StringDb +from ebel.manager.orientdb.biodbs.uniprot import 
UniProt +from ebel.manager.orientdb.constants import (BIOGRID, CHEBI, CLINICAL_TRIALS, + CLINVAR, DISGENET, DRUGBANK, + ENSEMBL, EXPRESSION_ATLAS, + GWAS_CATALOG, HGNC, INTACT, + IUPHAR, KEGG, MIRTARBASE, NCBI, + NSIDES, PATHWAY_COMMONS, + PROTEIN_ATLAS, REACTOME, STRINGDB, + UNIPROT) +from ebel.manager.orientdb.importer import _BelImporter +from ebel.manager.orientdb.odb_defaults import bel_func_short +from ebel.manager.orientdb.odb_meta import Graph logger = logging.getLogger(__name__) @@ -97,7 +100,7 @@ def __init__(self, graph_config: Optional[dict] = None, overwrite_config: bool = edges, indices, config_params=graph_config, - overwrite_config=overwrite_config + overwrite_config=overwrite_config, ) @property @@ -247,14 +250,16 @@ def nsides(self) -> Nsides: self.__nsides = Nsides() return self.__nsides - def import_json(self, - input_path: Union[str, Iterable[str]], - extend_graph: bool = True, - update_from_protein2gene: bool = True, - skip_drugbank: bool = False, - drugbank_user: str = None, - drugbank_password: str = None, - include_subfolders: bool = False) -> list: + def import_json( + self, + input_path: Union[str, Iterable[str]], + extend_graph: bool = True, + update_from_protein2gene: bool = True, + skip_drugbank: bool = False, + drugbank_user: Optional[str] = None, + drugbank_password: Optional[str] = None, + include_subfolders: bool = False, + ) -> list: """Import BEL JSON file(s) into OrientDB. Parameters @@ -331,9 +336,11 @@ def import_json(self, return inserted_files - def enrich_network(self, - include: Union[str, Iterable[str]] = [], - skip: Union[str, Iterable[str]] = []) -> Set[str]: + def enrich_network( + self, + include: Union[str, Iterable[str]] = [], + skip: Union[str, Iterable[str]] = [], + ) -> Set[str]: """Add all external resources to the network. 
Parameters @@ -372,11 +379,11 @@ def enrich_network(self, CLINICAL_TRIALS: self.clinical_trials, PROTEIN_ATLAS: self.protein_atlas, NCBI: self.ncbi, - EXPRESSION_ATLAS: self.expression_atlas + # EXPRESSION_ATLAS: self.expression_atlas, # TODO: Skipped because file with all results is no longer supported } # calc all sets (what is used, missing in skip and include) - include = [] if include == ['[]'] else include # fixes problems in python3.9/docker - skip = [] if skip == ['[]'] else skip # fixes problems in python3.9/docker + include = [] if include == ["[]"] else include # fixes problems in python3.9/docker + skip = [] if skip == ["[]"] else skip # fixes problems in python3.9/docker include_set = {include.lower()} if isinstance(include, str) else set([x.lower() for x in include]) skip_set = {skip.lower()} if isinstance(skip, str) else set([x.lower() for x in skip]) biodb_set = set([x.lower() for x in biodb_updaters.keys()]) @@ -395,7 +402,7 @@ def enrich_network(self, db_names = [x for x in biodb_updaters if x in used_biodb_set] for db_name in db_names: - print(f'Enrich network - {db_name.upper()}') + print(f"Enrich network - {db_name.upper()}") db: odb_meta.Graph = biodb_updaters[db_name] db.update() logger.info(f"Enrichment with {db_name} completed.") @@ -406,14 +413,14 @@ def enrich_network(self, @property def pure_protein_rid_dict(self) -> dict: """Get pure protein/rid dictbel. 
where name is the key and rid is the value.""" - pureprots = self.query_class(class_name='protein', columns=['name'], with_rid=True, pure=True) # List of dicts - return {prot['name']: prot[RID] for prot in pureprots} + pureprots = self.query_class(class_name="protein", columns=["name"], with_rid=True, pure=True) # List of dicts + return {prot["name"]: prot[RID] for prot in pureprots} def _update_species(self) -> dict: """Add species taxon ID to bel nodes.""" namespaces_updated = {} - for molec_state in ['protein', 'rna', 'gene']: + for molec_state in ["protein", "rna", "gene"]: for namespace, species_id in SPECIES_NAMESPACE.items(): update_pure_bio_sql = f"""UPDATE {molec_state} SET species = {species_id} @@ -435,7 +442,7 @@ def _update_species(self) -> dict: 'has__gene', 'has__rna') FROM {rid}) WHERE @class in ['protein','rna','gene']""" namespace_results = self.execute(sql) - ns_set = {x.oRecordData['namespace'] for x in namespace_results if namespace_results} + ns_set = {x.oRecordData["namespace"] for x in namespace_results if namespace_results} if len(ns_set) == 1: ns = list(ns_set)[0] @@ -450,7 +457,10 @@ def _update_from_protein2gene(self) -> Dict[str, int]: """Adds translated_to and transcribed_to to pure=true proteins and rnas id not exists.""" added_translated_to = self._add_missing_translated_to_edges() added_transcribed_to = self._add_missing_transcribed_to_edges() - return {'added_translated_to': added_translated_to, 'added_transcribed_to': added_transcribed_to} + return { + "added_translated_to": added_translated_to, + "added_transcribed_to": added_transcribed_to, + } def _create_and_tag_pure(self): """Create pure gene, RNA, micro_rna, abundance, complex (as abundance) and protein objects (if not exists). 
@@ -485,11 +495,13 @@ def _tag_pure(self) -> int: def _create_pure_nodes_to_modified(self) -> int: """Create all has_modified_(protein|gene) edges in OrientDB (proteins without a pure counterpart).""" - edge_classes = {'has__pmod': "has_modified_protein", - 'has__gmod': "has_modified_gene", - 'has__fragment': "has_fragmented_protein", - 'has__variant': "has_variant_{}", - 'has__location': "has_located_{}"} + edge_classes = { + "has__pmod": "has_modified_protein", + "has__gmod": "has_modified_gene", + "has__fragment": "has_fragmented_protein", + "has__variant": "has_variant_{}", + "has__location": "has_located_{}", + } results = {} sql = """Select @@ -508,11 +520,11 @@ def _create_pure_nodes_to_modified(self) -> int: r = row.oRecordData if r: # Might return an empty result, but don't know until we look at oRecordData - namespace = r['namespace'] - name = r['name'] - class_name = r['class_name'] + namespace = r["namespace"] + name = r["name"] + class_name = r["class_name"] - if '{}' in class_name_from_pure: + if "{}" in class_name_from_pure: cname_from_pure = class_name_from_pure.format(class_name) else: cname_from_pure = class_name_from_pure @@ -520,19 +532,22 @@ def _create_pure_nodes_to_modified(self) -> int: bel_function = bel_func_short[class_name] bel = f'{bel_function}({namespace}:"{name}")' - data = {'namespace': namespace, - 'name': name, - 'pure': True, - 'bel': bel - } + data = { + "namespace": namespace, + "name": name, + "pure": True, + "bel": bel, + } - from_rid = self.get_create_rid(class_name=class_name, value_dict=data, check_for='bel') + from_rid = self.get_create_rid(class_name=class_name, value_dict=data, check_for="bel") to_rid = r[RID] - self.create_edge(class_name=cname_from_pure, - from_rid=from_rid, - to_rid=to_rid, - if_not_exists=True) + self.create_edge( + class_name=cname_from_pure, + from_rid=from_rid, + to_rid=to_rid, + if_not_exists=True, + ) created += 1 return created @@ -568,20 +583,23 @@ def __update_involved(self, node_class) -> 
Dict[str, int]: where @class not in ['protein','gene','rna'] and name is not null)""" updated_involved_other += self.execute(sql)[0] - return {'updated_involved_genes': updated_involved_genes, 'updated_involved_other': updated_involved_other} + return { + "updated_involved_genes": updated_involved_genes, + "updated_involved_other": updated_involved_other, + } def _update_involved(self) -> Dict[str, int]: """Update involved genes and others.""" - result = {'updated_involved_genes': 0, 'updated_involved_other': 0} - for node_class in ['bel', 'reactants', 'products']: + result = {"updated_involved_genes": 0, "updated_involved_other": 0} + for node_class in ["bel", "reactants", "products"]: sub_result = self.__update_involved(node_class) - result['updated_involved_genes'] += sub_result['updated_involved_genes'] - result['updated_involved_other'] += sub_result['updated_involved_other'] + result["updated_involved_genes"] += sub_result["updated_involved_genes"] + result["updated_involved_other"] += sub_result["updated_involved_other"] return result def _add_missing_has_variant_edges(self): # TODO: Implement this completely - ModifiedProtein = namedtuple('ModifiedProtein', ['ns', 'name', 'rids']) + ModifiedProtein = namedtuple("ModifiedProtein", ["ns", "name", "rids"]) modified_proteins_sql = """Select list(@rid.asString()) as rids, name, @@ -596,7 +614,7 @@ def _add_missing_has_variant_edges(self): group by name, namespace""" results = [r.oRecordData for r in self.execute(modified_proteins_sql)] - modified_proteins = [ModifiedProtein(r['ns'], r['name'], r['rids']) for r in results] + modified_proteins = [ModifiedProtein(r["ns"], r["name"], r["rids"]) for r in results] for modified_protein in modified_proteins: pass @@ -604,18 +622,20 @@ def _add_missing_has_variant_edges(self): def _add_missing_translated_to_edges(self) -> int: """Add missing RNAs to proteins and translated_to edges.""" return self.__add_missing_edges( - from_class='rna', - to_class='protein', - 
edge_name='translated_to', - bel_function='r') + from_class="rna", + to_class="protein", + edge_name="translated_to", + bel_function="r", + ) def _add_missing_transcribed_to_edges(self) -> int: """Add missing genes to RNAs and transcribed_to edges.""" return self.__add_missing_edges( - from_class='gene', - to_class='rna', - edge_name='transcribed_to', - bel_function='g') + from_class="gene", + to_class="rna", + edge_name="transcribed_to", + bel_function="g", + ) def __add_missing_edges(self, from_class, to_class, edge_name, bel_function) -> int: added = 0 @@ -636,15 +656,18 @@ def __add_missing_edges(self, from_class, to_class, edge_name, bel_function) -> for to_class_node in tqdm(self.execute(sql), desc=f"Adding {edge_name} edges"): p = to_class_node.oRecordData bel = '{bel_function}({ns}:"{name}")'.format( - ns=p['namespace'], - name=p['name'], - bel_function=bel_function + ns=p["namespace"], name=p["name"], bel_function=bel_function + ) + from_rid = self.get_create_rid( + from_class, + { + "namespace": p["namespace"], + "name": p["name"], + "pure": True, + "bel": bel, + }, + check_for="bel", ) - from_rid = self.get_create_rid(from_class, - {'namespace': p['namespace'], - 'name': p['name'], - 'pure': True, - 'bel': bel}, check_for='bel') self.create_edge(class_name=edge_name, from_rid=from_rid, to_rid=p[RID]) added += 1 diff --git a/ebel/manager/orientdb/biodbs/biogrid.py b/ebel/manager/orientdb/biodbs/biogrid.py index ed96e81..c2e7e1c 100644 --- a/ebel/manager/orientdb/biodbs/biogrid.py +++ b/ebel/manager/orientdb/biodbs/biogrid.py @@ -1,32 +1,34 @@ """BioGrid.""" import typing -import numpy as np -import pandas as pd from enum import Enum +from typing import Dict, Tuple -from tqdm import tqdm +import numpy as np +import pandas as pd from pyorientdb import OrientDB -from typing import Dict, Tuple +from tqdm import tqdm from ebel import tools -from ebel.manager.orientdb.constants import BIOGRID from ebel.manager.orientdb import odb_meta, odb_structure, urls -from 
ebel.manager.constants import normalized_pmod_reverse, BelPmod - +from ebel.manager.orientdb.constants import BIOGRID +from ebel.manager.orientdb.odb_defaults import BelPmod, normalized_pmod_reverse from ebel.manager.rdbms.models import biogrid -STANDARD_NAMESPACES = { - 9606: 'HGNC', - 10090: 'MGI', - 10116: 'RGD' -} +STANDARD_NAMESPACES = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} class BioGridNode: """Custom class definition for BioGRID nodes.""" - def __init__(self, rid: str, symbol: str, uniprot: str, taxonomy_id: int, pmod_bel: str = None): + def __init__( + self, + rid: str, + symbol: str, + uniprot: str, + taxonomy_id: int, + pmod_bel: str = None, + ): """Init for BioGRID generated nodes.""" self.rid = rid self.symbol: str = symbol @@ -40,7 +42,7 @@ def __get_nn(self) -> Tuple[str, str]: namespace = STANDARD_NAMESPACES[self.taxonomy_id] name = self.symbol else: - namespace = 'UNIPROT' + namespace = "UNIPROT" name = self.uniprot return namespace, name @@ -50,10 +52,12 @@ def __get_protein_as_value_dict(self, pmod: bool = False): else: bel = self.get_protein_bel() - value_dict = {'name': self.name, - 'namespace': self.namespace, - 'bel': bel, - 'uniprot': self.uniprot} + value_dict = { + "name": self.name, + "namespace": self.namespace, + "bel": bel, + "uniprot": self.uniprot, + } return value_dict def get_pmod_protein_as_value_dict(self): @@ -74,25 +78,52 @@ def get_protein_bel(self) -> str: def get_pmod_bel(self): """Get the protein modification for a BEL.""" - return {'bel': f"pmod({self.pmod_bel})", 'type': normalized_pmod_reverse[self.pmod_bel]} + return { + "bel": f"pmod({self.pmod_bel})", + "type": normalized_pmod_reverse[self.pmod_bel], + } class BioGridEdge: """Class definition for BioGRID edges.""" - def __init__(self, subject_rid: str, subject_symbol: str, subject_uniprot: str, subject_taxonomy_id: int, - modification: str, object_rid: str, object_symbol: str, object_uniprot: str, object_taxonomy_id: int, - experimental_system: str, pmids: str, 
num_pubs: int, dois: str, biogrid_ids: str): + def __init__( + self, + subject_rid: str, + subject_symbol: str, + subject_uniprot: str, + subject_taxonomy_id: int, + modification: str, + object_rid: str, + object_symbol: str, + object_uniprot: str, + object_taxonomy_id: int, + experimental_system: str, + pmids: str, + num_pubs: int, + dois: str, + biogrid_ids: str, + ): """Init for BioGRID edges.""" mods = Modification.get_reverse_dict() self.modification = modification self.modConfig: ModConfig = mods[modification] - self.subj: BioGridNode = BioGridNode(subject_rid, subject_symbol, subject_uniprot, subject_taxonomy_id, - self.modConfig.pmod_bel) - self.obj: BioGridNode = BioGridNode(object_rid, object_symbol, object_uniprot, object_taxonomy_id, - self.modConfig.pmod_bel) + self.subj: BioGridNode = BioGridNode( + subject_rid, + subject_symbol, + subject_uniprot, + subject_taxonomy_id, + self.modConfig.pmod_bel, + ) + self.obj: BioGridNode = BioGridNode( + object_rid, + object_symbol, + object_uniprot, + object_taxonomy_id, + self.modConfig.pmod_bel, + ) self.experimental_system: str = experimental_system self.pmids: str = pmids @@ -107,30 +138,33 @@ def both_pure_proteins_exists(self): def get_edge_value_dict(self): """Get edge value dictionary.""" - value_dict = {'modification': self.modification} + value_dict = {"modification": self.modification} if self.dois: - value_dict['dois'] = self.dois.split(',') + value_dict["dois"] = self.dois.split(",") if self.pmids: - value_dict['pmids'] = [int(x) for x in self.pmids.split(',')] + value_dict["pmids"] = [int(x) for x in self.pmids.split(",")] if self.biogrid_ids: - value_dict['biogrid_ids'] = [int(x) for x in self.biogrid_ids.split(',')] + value_dict["biogrid_ids"] = [int(x) for x in self.biogrid_ids.split(",")] return value_dict def get_pmod_as_value_dict(self): """Get pmod metadata as dictionary.""" - return {'bel': f'pmod({self.modConfig.pmod_bel})', 'type': normalized_pmod_reverse[self.modConfig.pmod_bel]} + return 
{ + "bel": f"pmod({self.modConfig.pmod_bel})", + "type": normalized_pmod_reverse[self.modConfig.pmod_bel], + } @property def edge_name(self): """Get edge name.""" - return self.modConfig.effect + '_' + normalized_pmod_reverse[self.modConfig.pmod_bel] + '_bg' + return self.modConfig.effect + "_" + normalized_pmod_reverse[self.modConfig.pmod_bel] + "_bg" class Effect: """Class definition for Effect.""" - INCREASES = 'increases' - DECREASES = 'decreases' + INCREASES = "increases" + DECREASES = "decreases" class ModConfig: @@ -156,25 +190,27 @@ def __init__(self, bg_mod_name: str, effect: str, pmod_bel: str = None): class Modification(Enum): """BioGrid modification and configuration for BEL converting.""" - PHOSPHORYLATION: ModConfig = ModConfig('Phosphorylation', Effect.INCREASES, BelPmod.PHO) - UBIQUITINATION: ModConfig = ModConfig('Ubiquitination', Effect.INCREASES, BelPmod.UBI) - ACETYLATION: ModConfig = ModConfig('Acetylation', Effect.INCREASES, BelPmod.ACE) - DEUBIQUITINATION: ModConfig = ModConfig('Deubiquitination', Effect.DECREASES, BelPmod.UBI) - PROTEOLYTIC_PROCESSING: ModConfig = ModConfig('Proteolytic Processing', Effect.DECREASES) - METHYLATION: ModConfig = ModConfig('Methylation', Effect.INCREASES, BelPmod.ME0) - SUMOYLATION: ModConfig = ModConfig('Sumoylation', Effect.INCREASES, BelPmod.SUM) - DEPHOSPHORYLATION: ModConfig = ModConfig('Dephosphorylation', Effect.DECREASES, BelPmod.PHO) - DEACETYLATION: ModConfig = ModConfig('Deacetylation', Effect.DECREASES, BelPmod.ACE) - NEDD_RUB1_YLATION: ModConfig = ModConfig('Nedd(Rub1)ylation', Effect.INCREASES, BelPmod.NED) - RIBOSYLATION: ModConfig = ModConfig('Ribosylation', Effect.INCREASES, BelPmod.ADD) - DESUMOYLATION: ModConfig = ModConfig('Desumoylation', Effect.DECREASES, BelPmod.SUM) - DEMETHYLATION: ModConfig = ModConfig('Demethylation', Effect.DECREASES, BelPmod.ME0) - DENEDDYLATION: ModConfig = ModConfig('Deneddylation', Effect.DECREASES, BelPmod.NED) - PRENYLATION: ModConfig = ModConfig('Prenylation', 
Effect.INCREASES, BelPmod.PRE) - GLYCOSYLATION: ModConfig = ModConfig('Glycosylation', Effect.INCREASES, BelPmod.GLY) - NEDDYLATION: ModConfig = ModConfig('Neddylation', Effect.INCREASES, BelPmod.NED) - DE_ISGYLATION: ModConfig = ModConfig('de-ISGylation', Effect.INCREASES, BelPmod.DEI) - FAT10YLATION: ModConfig = ModConfig('FAT10ylation', Effect.INCREASES, BelPmod.FAT) + PHOSPHORYLATION: ModConfig = ModConfig("Phosphorylation", Effect.INCREASES, BelPmod.PHO) + UBIQUITINATION: ModConfig = ModConfig("Ubiquitination", Effect.INCREASES, BelPmod.UBI) + ACETYLATION: ModConfig = ModConfig("Acetylation", Effect.INCREASES, BelPmod.ACE) + DEUBIQUITINATION: ModConfig = ModConfig("Deubiquitination", Effect.DECREASES, BelPmod.UBI) + PROTEOLYTIC_PROCESSING: ModConfig = ModConfig("Proteolytic Processing", Effect.DECREASES) + METHYLATION: ModConfig = ModConfig("Methylation", Effect.INCREASES, BelPmod.ME0) + SUMOYLATION: ModConfig = ModConfig("Sumoylation", Effect.INCREASES, BelPmod.SUM) + DEPHOSPHORYLATION: ModConfig = ModConfig("Dephosphorylation", Effect.DECREASES, BelPmod.PHO) + DEACETYLATION: ModConfig = ModConfig("Deacetylation", Effect.DECREASES, BelPmod.ACE) + NEDD_RUB1_YLATION: ModConfig = ModConfig("Nedd(Rub1)ylation", Effect.INCREASES, BelPmod.NED) + RIBOSYLATION: ModConfig = ModConfig("Ribosylation", Effect.INCREASES, BelPmod.ADD) + DESUMOYLATION: ModConfig = ModConfig("Desumoylation", Effect.DECREASES, BelPmod.SUM) + DEMETHYLATION: ModConfig = ModConfig("Demethylation", Effect.DECREASES, BelPmod.ME0) + DENEDDYLATION: ModConfig = ModConfig("Deneddylation", Effect.DECREASES, BelPmod.NED) + PRENYLATION: ModConfig = ModConfig("Prenylation", Effect.INCREASES, BelPmod.PRE) + GLYCOSYLATION: ModConfig = ModConfig("Glycosylation", Effect.INCREASES, BelPmod.GLY) + NEDDYLATION: ModConfig = ModConfig("Neddylation", Effect.INCREASES, BelPmod.NED) + DE_ISGYLATION: ModConfig = ModConfig("de-ISGylation", Effect.INCREASES, BelPmod.DEI) + FAT10YLATION: ModConfig = 
ModConfig("FAT10ylation", Effect.INCREASES, BelPmod.FAT) + UFMYLATION: ModConfig = ModConfig("Ufmylation", Effect.INCREASES, BelPmod.UFM) + ISGYLATION: ModConfig = ModConfig("ISGylation", Effect.INCREASES, BelPmod.ISG) @classmethod def get_reverse_dict(cls) -> typing.Dict[str, ModConfig]: @@ -194,17 +230,19 @@ def __init__(self, client: OrientDB = None): self.biodb_name = BIOGRID self.url = urls.BIOGRID self.urls = {self.biodb_name: self.url} - super().__init__(edges=odb_structure.biogrid_edges, - urls=self.urls, - tables_base=biogrid.Base, - biodb_name=self.biodb_name) + super().__init__( + edges=odb_structure.biogrid_edges, + urls=self.urls, + tables_base=biogrid.Base, + biodb_name=self.biodb_name, + ) self.file_path = tools.get_file_path(urls.BIOGRID, self.biodb_name) self.bel_rid_cache = {} def __len__(self) -> int: """Get number of 'biogrid_interaction' graph edges.""" - return self.number_of_edges['biogrid_interaction'] + return self.number_of_edges["biogrid_interaction"] def __contains__(self, biogrid_id) -> bool: """Check if biogrid_interaction edge with biogrid_id exists in graph.""" @@ -214,37 +252,39 @@ def __contains__(self, biogrid_id) -> bool: def insert_data(self) -> Dict[str, int]: """Insert BioGRID data into database.""" - use_columns = {'#BioGRID Interaction ID': 'biogrid_id', - 'BioGRID ID Interactor A': 'biogrid_a_id', - 'BioGRID ID Interactor B': 'biogrid_b_id', - 'Entrez Gene Interactor A': 'entrez_a', - 'Entrez Gene Interactor B': 'entrez_b', - 'Systematic Name Interactor A': 'systematic_name_a', - 'Systematic Name Interactor B': 'systematic_name_b', - 'Official Symbol Interactor A': 'symbol_a', - 'Official Symbol Interactor B': 'symbol_b', - 'Experimental System': 'experimental_system', - 'Experimental System Type': 'experimental_system_type', - 'Author': 'author', - 'Publication Source': 'publication_source', - 'Organism ID Interactor A': 'taxonomy_a_id', - 'Organism ID Interactor B': 'taxonomy_b_id', - 'Throughput': 'throughput', - 
'Score': 'score', - 'Modification': 'modification', - 'Qualifications': 'qualification', - 'Source Database': 'source', - 'SWISS-PROT Accessions Interactor A': 'uniprot_a', - 'TREMBL Accessions Interactor A': 'trembl_a', - 'SWISS-PROT Accessions Interactor B': 'uniprot_b', - 'TREMBL Accessions Interactor B': 'trembl_b', - 'Organism Name Interactor A': 'org_a', - 'Organism Name Interactor B': 'org_b'} + use_columns = { + "#BioGRID Interaction ID": "biogrid_id", + "BioGRID ID Interactor A": "biogrid_a_id", + "BioGRID ID Interactor B": "biogrid_b_id", + "Entrez Gene Interactor A": "entrez_a", + "Entrez Gene Interactor B": "entrez_b", + "Systematic Name Interactor A": "systematic_name_a", + "Systematic Name Interactor B": "systematic_name_b", + "Official Symbol Interactor A": "symbol_a", + "Official Symbol Interactor B": "symbol_b", + "Experimental System": "experimental_system", + "Experimental System Type": "experimental_system_type", + "Author": "author", + "Publication Source": "publication_source", + "Organism ID Interactor A": "taxonomy_a_id", + "Organism ID Interactor B": "taxonomy_b_id", + "Throughput": "throughput", + "Score": "score", + "Modification": "modification", + "Qualifications": "qualification", + "Source Database": "source", + "SWISS-PROT Accessions Interactor A": "uniprot_a", + "TREMBL Accessions Interactor A": "trembl_a", + "SWISS-PROT Accessions Interactor B": "uniprot_b", + "TREMBL Accessions Interactor B": "trembl_b", + "Organism Name Interactor A": "org_a", + "Organism Name Interactor B": "org_b", + } # main table df = pd.read_csv(self.file_path, usecols=use_columns.keys(), sep="\t", low_memory=False) df.rename(columns=use_columns, inplace=True) - df.replace('-', np.nan, inplace=True) + df.replace("-", np.nan, inplace=True) # experimental system df = self._create_experimental_system_table(df) @@ -269,108 +309,146 @@ def insert_data(self) -> Dict[str, int]: # save main df.index += 1 - df.index.rename('id', inplace=True) + df.index.rename("id", 
inplace=True) df.to_sql(biogrid.Biogrid.__tablename__, self.engine, if_exists="append") return {self.biodb_name: df.shape[0]} def _create_publication_table(self, df: pd.DataFrame) -> pd.DataFrame: - df_ay = df.author.str.extract(r'^(?P[^(]+)\s*\((?P\d+)\)$') - df_source = df.publication_source.str.extract(r'^(?P[^:]+):(?P.*)') + df_ay = df.author.str.extract(r"^(?P[^(]+)\s*\((?P\d+)\)$") + df_source = df.publication_source.str.extract(r"^(?P[^:]+):(?P.*)") df_pub = pd.concat([df_ay, df_source, df.publication_source], axis=1) df_pub.drop_duplicates(inplace=True) df_pub.reset_index(inplace=True) df_pub.index += 1 - df_pub.index.rename('id', inplace=True) - df_pub['publication_id'] = df_pub.index - df_pub_4join = df_pub.set_index('publication_source')[['publication_id']] - df = df_pub_4join.join(df.set_index('publication_source'), how="right").reset_index().drop( - columns=['publication_source', 'author']) - df_pub.drop(columns=['publication_source', 'index', 'publication_id'], inplace=True) - df_pub.to_sql(biogrid.Publication.__tablename__, self.engine, if_exists='append') + df_pub.index.rename("id", inplace=True) + df_pub["publication_id"] = df_pub.index + df_pub_4join = df_pub.set_index("publication_source")[["publication_id"]] + df = ( + df_pub_4join.join(df.set_index("publication_source"), how="right") + .reset_index() + .drop(columns=["publication_source", "author"]) + ) + df_pub.drop(columns=["publication_source", "index", "publication_id"], inplace=True) + df_pub.to_sql(biogrid.Publication.__tablename__, self.engine, if_exists="append") return df def _create_modification_table(self, df: pd.DataFrame) -> pd.DataFrame: - df_mod = df[['modification']].dropna().value_counts().reset_index().rename(columns={0: 'frequency'}) + df_mod = df[["modification"]].dropna().value_counts().reset_index().rename(columns={0: "frequency"}) df_mod.index += 1 - df_mod.index.rename('id', inplace=True) - df_mod['modification_id'] = df_mod.index - df = 
df_mod.set_index('modification')[['modification_id']].join(df.set_index('modification'), - how="right").reset_index().drop( - columns=['modification']) - df_mod.drop(columns=['modification_id']).to_sql(biogrid.Modification.__tablename__, self.engine, - if_exists='append') + df_mod.index.rename("id", inplace=True) + df_mod["modification_id"] = df_mod.index + df = ( + df_mod.set_index("modification")[["modification_id"]] + .join(df.set_index("modification"), how="right") + .reset_index() + .drop(columns=["modification"]) + ) + df_mod.drop(columns=["modification_id"]).to_sql( + biogrid.Modification.__tablename__, self.engine, if_exists="append" + ) return df def _create_throughput_table(self, df: pd.DataFrame) -> pd.DataFrame: - df_tp = df[['throughput']].value_counts().reset_index().rename(columns={0: 'frequency'}) + df_tp = df[["throughput"]].value_counts().reset_index().rename(columns={0: "frequency"}) df_tp.index += 1 - df_tp.index.rename('id', inplace=True) - df_tp['throughput_id'] = df_tp.index - df_tp_join = df_tp.set_index('throughput')[['throughput_id']] - df = df_tp_join.join(df.set_index('throughput'), how="right").reset_index().drop(columns=['throughput']) - df_tp.drop(columns=['throughput_id'], inplace=True) - df_tp.to_sql(biogrid.Throughput.__tablename__, self.engine, if_exists='append') + df_tp.index.rename("id", inplace=True) + df_tp["throughput_id"] = df_tp.index + df_tp_join = df_tp.set_index("throughput")[["throughput_id"]] + df = df_tp_join.join(df.set_index("throughput"), how="right").reset_index().drop(columns=["throughput"]) + df_tp.drop(columns=["throughput_id"], inplace=True) + df_tp.to_sql(biogrid.Throughput.__tablename__, self.engine, if_exists="append") return df def _create_interactor_table(self, df: pd.DataFrame) -> pd.DataFrame: - columns = ['entrez', 'biogrid_id', 'systematic_name', - 'symbol', 'uniprot', 'trembl', 'taxonomy_id'] - cols_a = ['entrez_a', 'biogrid_a_id', 'systematic_name_a', - 'symbol_a', 'uniprot_a', 'trembl_a', 
'taxonomy_a_id'] - cols_b = ['entrez_b', 'biogrid_b_id', 'systematic_name_b', - 'symbol_b', 'uniprot_b', 'trembl_b', 'taxonomy_b_id'] + columns = [ + "entrez", + "biogrid_id", + "systematic_name", + "symbol", + "uniprot", + "trembl", + "taxonomy_id", + ] + cols_a = [ + "entrez_a", + "biogrid_a_id", + "systematic_name_a", + "symbol_a", + "uniprot_a", + "trembl_a", + "taxonomy_a_id", + ] + cols_b = [ + "entrez_b", + "biogrid_b_id", + "systematic_name_b", + "symbol_b", + "uniprot_b", + "trembl_b", + "taxonomy_b_id", + ] df_a = df[cols_a] df_a.columns = columns df_b = df[cols_b] df_b.columns = columns - df_ia = pd.concat([df_a.set_index('biogrid_id'), df_b.set_index('biogrid_id')]).drop_duplicates() + df_ia = pd.concat([df_a.set_index("biogrid_id"), df_b.set_index("biogrid_id")]).drop_duplicates() # extract the first accession - df_ia.uniprot = df_ia.uniprot.str.split('|').str[0] - df_ia.trembl = df_ia.trembl.str.split('|').str[0] - df_ia.replace('-', None).to_sql(biogrid.Interactor.__tablename__, self.engine, if_exists='append') - cols4delete = list(set(cols_a + cols_b) - {'biogrid_a_id', 'biogrid_b_id'}) + df_ia.uniprot = df_ia.uniprot.str.split("|").str[0] + df_ia.trembl = df_ia.trembl.str.split("|").str[0] + df_ia.replace("-", None).to_sql(biogrid.Interactor.__tablename__, self.engine, if_exists="append") + cols4delete = list(set(cols_a + cols_b) - {"biogrid_a_id", "biogrid_b_id"}) df.drop(columns=cols4delete, inplace=True) return df def _create_experimental_system_table(self, df: pd.DataFrame) -> pd.DataFrame: # extract ExperimentalSystem - df_exp = df[['experimental_system', 'experimental_system_type']].value_counts().reset_index().rename( - columns={0: 'frequency'}) + df_exp = ( + df[["experimental_system", "experimental_system_type"]] + .value_counts() + .reset_index() + .rename(columns={0: "frequency"}) + ) df_exp.index += 1 - df_exp.index.rename('id', inplace=True) - df_exp['experimental_system_id'] = df_exp.index + df_exp.index.rename("id", inplace=True) + 
df_exp["experimental_system_id"] = df_exp.index # join ExperimentalSystem ID to main table and drop cols experimental_system, experimental_system_type - df = df_exp.set_index(['experimental_system', 'experimental_system_type'])[['experimental_system_id']].join( - df.set_index(['experimental_system', 'experimental_system_type'])).reset_index().drop( - columns=['experimental_system', 'experimental_system_type']) + df = ( + df_exp.set_index(["experimental_system", "experimental_system_type"])[["experimental_system_id"]] + .join(df.set_index(["experimental_system", "experimental_system_type"])) + .reset_index() + .drop(columns=["experimental_system", "experimental_system_type"]) + ) # save ExperimentalSystem - df_exp.drop(columns=['experimental_system_id']).to_sql(biogrid.ExperimentalSystem.__tablename__, self.engine, - if_exists='append') + df_exp.drop(columns=["experimental_system_id"]).to_sql( + biogrid.ExperimentalSystem.__tablename__, self.engine, if_exists="append" + ) return df def _create_taxonomy_table(self, df: pd.DataFrame): - df_org_a = df[['taxonomy_a_id', 'org_a']].rename( - columns={'taxonomy_a_id': 'taxonomy_id', 'org_a': 'organism_name'}) - df_org_b = df[['taxonomy_b_id', 'org_b']].rename( - columns={'taxonomy_b_id': 'taxonomy_id', 'org_b': 'organism_name'}) + df_org_a = df[["taxonomy_a_id", "org_a"]].rename( + columns={"taxonomy_a_id": "taxonomy_id", "org_a": "organism_name"} + ) + df_org_b = df[["taxonomy_b_id", "org_b"]].rename( + columns={"taxonomy_b_id": "taxonomy_id", "org_b": "organism_name"} + ) df_taxonomy = pd.concat([df_org_a, df_org_b]) df_taxonomy.drop_duplicates(inplace=True) df_taxonomy.reset_index(inplace=True) - df_taxonomy.drop(columns=['index'], inplace=True) - df_taxonomy.set_index('taxonomy_id', inplace=True) - df_taxonomy.to_sql(biogrid.Taxonomy.__tablename__, self.engine, if_exists='append') - df.drop(columns=['org_a', 'org_b'], inplace=True) + df_taxonomy.drop(columns=["index"], inplace=True) + 
df_taxonomy.set_index("taxonomy_id", inplace=True) + df_taxonomy.to_sql(biogrid.Taxonomy.__tablename__, self.engine, if_exists="append") + df.drop(columns=["org_a", "org_b"], inplace=True) def _create_source_table(self, df: pd.DataFrame) -> pd.DataFrame: - df_source = df[['source']].drop_duplicates().reset_index().drop(columns=['index']) + df_source = df[["source"]].drop_duplicates().reset_index().drop(columns=["index"]) df_source.index += 1 - df_source.index.rename('id', inplace=True) - df_source['source_id'] = df_source.index - df = df_source.set_index(['source']).join(df.set_index(['source'])).reset_index().drop(columns=['source']) - df_source.drop(columns=['source_id']).to_sql(biogrid.Source.__tablename__, self.engine, if_exists='append') + df_source.index.rename("id", inplace=True) + df_source["source_id"] = df_source.index + df = df_source.set_index(["source"]).join(df.set_index(["source"])).reset_index().drop(columns=["source"]) + df_source.drop(columns=["source_id"]).to_sql(biogrid.Source.__tablename__, self.engine, if_exists="append") return df def get_uniprot_modification_pairs(self): @@ -401,15 +479,20 @@ def get_uniprot_modification_pairs(self): def get_create_pure_protein_rid_by_uniprot(self, taxonomy_id, symbol, uniprot): """Get pure protein rid by UniProt accession ID if the protein is involved in a BEL statement.""" - namespace = STANDARD_NAMESPACES.get(taxonomy_id, 'UNIPROT') - name = uniprot if namespace == 'UNIPROT' else symbol + namespace = STANDARD_NAMESPACES.get(taxonomy_id, "UNIPROT") + name = uniprot if namespace == "UNIPROT" else symbol bel = f'p({namespace}:"{name}")' if bel in self.bel_rid_cache: rid = self.bel_rid_cache[bel] else: - value_dict = {'bel': bel, 'pure': True, 'namespace': namespace, 'name': name} - rid = self.get_create_rid('protein', value_dict=value_dict, check_for='bel') + value_dict = { + "bel": bel, + "pure": True, + "namespace": namespace, + "name": name, + } + rid = self.get_create_rid("protein", 
value_dict=value_dict, check_for="bel") self.bel_rid_cache[bel] = rid return rid @@ -456,42 +539,55 @@ def update_interactions(self) -> int: counter = 0 self.clear_edges() - for e in tqdm(uniprot_modification_pairs, desc=f"Update {self.biodb_name.upper()} interactions"): - if e['subject_uniprot'] in uniprots_in_bel_set or e['object_uniprot'] in uniprots_in_bel_set: + for e in tqdm( + uniprot_modification_pairs, + desc=f"Update {self.biodb_name.upper()} interactions", + ): + if e["subject_uniprot"] in uniprots_in_bel_set or e["object_uniprot"] in uniprots_in_bel_set: subj_pure_rid = self.get_create_pure_protein_rid_by_uniprot( - taxonomy_id=e['subject_taxonomy_id'], - symbol=e['subject_symbol'], - uniprot=e['subject_uniprot'] + taxonomy_id=e["subject_taxonomy_id"], + symbol=e["subject_symbol"], + uniprot=e["subject_uniprot"], ) obj_pure_rid = self.get_create_pure_protein_rid_by_uniprot( - taxonomy_id=e['object_taxonomy_id'], - symbol=e['object_symbol'], - uniprot=e['object_uniprot'] + taxonomy_id=e["object_taxonomy_id"], + symbol=e["object_symbol"], + uniprot=e["object_uniprot"], ) - sql = sql_temp.format(subject_uniprot=e['subject_uniprot'], object_uniprot=e['object_uniprot']) + sql = sql_temp.format( + subject_uniprot=e["subject_uniprot"], + object_uniprot=e["object_uniprot"], + ) for row in self.engine.execute(sql).fetchall(): row_dict = dict(row) be = BioGridEdge(subject_rid=subj_pure_rid, object_rid=obj_pure_rid, **row_dict) edge_value_dict = be.get_edge_value_dict() - if be.modConfig.bg_mod_name == 'Proteolytic Processing': - self.create_edge('decreases_bg', - from_rid=subj_pure_rid, - to_rid=obj_pure_rid, - value_dict=edge_value_dict) + if be.modConfig.bg_mod_name == "Proteolytic Processing": + self.create_edge( + "decreases_bg", + from_rid=subj_pure_rid, + to_rid=obj_pure_rid, + value_dict=edge_value_dict, + ) counter += 1 else: obj_pmod_value_dict = be.obj.get_pmod_protein_as_value_dict() - pmod_protein_rid = self.node_exists('protein', obj_pmod_value_dict, 
check_for='bel') + pmod_protein_rid = self.node_exists("protein", obj_pmod_value_dict, check_for="bel") if not pmod_protein_rid: - pmod_protein_rid = self.get_create_rid('protein', obj_pmod_value_dict, check_for='bel') - self.create_edge('has_modified_protein', obj_pure_rid, pmod_protein_rid) - pmod_rid = self.insert_record('pmod', be.get_pmod_as_value_dict()) - self.create_edge('has__pmod', pmod_protein_rid, pmod_rid) - self.create_edge(be.edge_name, subj_pure_rid, pmod_protein_rid, edge_value_dict) + pmod_protein_rid = self.get_create_rid("protein", obj_pmod_value_dict, check_for="bel") + self.create_edge("has_modified_protein", obj_pure_rid, pmod_protein_rid) + pmod_rid = self.insert_record("pmod", be.get_pmod_as_value_dict()) + self.create_edge("has__pmod", pmod_protein_rid, pmod_rid) + self.create_edge( + be.edge_name, + subj_pure_rid, + pmod_protein_rid, + edge_value_dict, + ) counter += 1 return counter diff --git a/ebel/manager/orientdb/biodbs/chebi.py b/ebel/manager/orientdb/biodbs/chebi.py index 159df39..3ace1cb 100644 --- a/ebel/manager/orientdb/biodbs/chebi.py +++ b/ebel/manager/orientdb/biodbs/chebi.py @@ -22,24 +22,24 @@ def __init__(self, client: OrientDB = None): """Init CHEBI.""" self.client = client self.biodb_name = CHEBI - self.urls = {chebi.Compound.__tablename__: urls.CHEBI_COMPOUND, - chebi.Relation.__tablename__: urls.CHEBI_RELATION, - chebi.Inchi.__tablename__: urls.CHEBI_INCHI, - chebi.ChemicalData.__tablename__: urls.CHEBI_CHEMICALDATA, - chebi.Comment.__tablename__: urls.CHEBI_COMMENT, - chebi.DatabaseAccession.__tablename__: urls.CHEBI_DATABASEACCESSION, - chebi.Name.__tablename__: urls.CHEBI_NAME, - chebi.Reference.__tablename__: urls.CHEBI_REFERENCE, - chebi.Structure.__tablename__: urls.CHEBI_STRUCTURE} - - super().__init__(urls=self.urls, - biodb_name=self.biodb_name, - tables_base=chebi.Base) + self.urls = { + chebi.Compound.__tablename__: urls.CHEBI_COMPOUND, + chebi.Relation.__tablename__: urls.CHEBI_RELATION, + 
chebi.Inchi.__tablename__: urls.CHEBI_INCHI, + chebi.ChemicalData.__tablename__: urls.CHEBI_CHEMICALDATA, + chebi.Comment.__tablename__: urls.CHEBI_COMMENT, + chebi.DatabaseAccession.__tablename__: urls.CHEBI_DATABASEACCESSION, + chebi.Name.__tablename__: urls.CHEBI_NAME, + chebi.Reference.__tablename__: urls.CHEBI_REFERENCE, + chebi.Structure.__tablename__: urls.CHEBI_STRUCTURE, + } + + super().__init__(urls=self.urls, biodb_name=self.biodb_name, tables_base=chebi.Base) def __len__(self) -> int: """Get number of edges in OrientDB.""" sql = "Select count(*) from bel where namespace='CHEBI'" - return self.execute(sql)[0].oRecordData['count'] + return self.execute(sql)[0].oRecordData["count"] def __contains__(self, name: str) -> bool: """Checks if RS number (without prefix RS) exists in BEL graph.""" @@ -48,23 +48,28 @@ def __contains__(self, name: str) -> bool: def insert_data(self) -> Dict[str, int]: """Insert data in generic OrientDB class.""" - logger.info(f'Insert data in {self.biodb_name}') + logger.info(f"Insert data in {self.biodb_name}") inserted = {} file_path_compound = tools.get_file_path(urls.CHEBI_COMPOUND, self.biodb_name) - df_compounds = pd.read_csv(file_path_compound, sep="\t", low_memory=False, on_bad_lines='skip') - df_compound_ids = df_compounds[['ID']] + df_compounds = pd.read_csv(file_path_compound, sep="\t", low_memory=False, on_bad_lines="skip") + df_compound_ids = df_compounds[["ID"]] del df_compounds - for table_name, url in tqdm(self.urls.items(), desc=f'Import {self.biodb_name.upper()}'): - + for table_name, url in tqdm(self.urls.items(), desc=f"Import {self.biodb_name.upper()}"): file_path = tools.get_file_path(url, self.biodb_name) - seperator = "\t" if re.search(r'.*\.tsv(\.gz)?$', file_path) else "," - encoding = "ISO-8859-1" if table_name == 'chebi_reference' else None - - dfs = pd.read_csv(file_path, sep=seperator, encoding=encoding, low_memory=False, on_bad_lines='skip', - chunksize=100000) + seperator = "\t" if 
re.search(r".*\.tsv(\.gz)?$", file_path) else "," + encoding = "ISO-8859-1" if table_name == "chebi_reference" else None + + dfs = pd.read_csv( + file_path, + sep=seperator, + encoding=encoding, + low_memory=False, + on_bad_lines="skip", + chunksize=100000, + ) inserted[table_name] = 0 for df in dfs: df.columns = df.columns.str.lower() @@ -72,22 +77,34 @@ def insert_data(self) -> Dict[str, int]: if table_name == chebi.Inchi.__tablename__: # with_index = True df.index += 1 - df.index.rename('id', inplace=True) - df.rename(columns={'chebi_id': "compound_id"}, inplace=True) - - if 'compound_id' in df.columns: - df = df_compound_ids.rename(columns={'ID': 'compound_id'}).set_index('compound_id').join( - df.set_index('compound_id'), how='inner').reset_index() - - if 'init_id' in df.columns: - df = df_compound_ids.rename(columns={'ID': 'init_id'}).set_index('init_id').join( - df.set_index('init_id'), how='inner').reset_index() - - if 'final_id' in df.columns: - df = df_compound_ids.rename(columns={'ID': 'final_id'}).set_index('final_id').join( - df.set_index('final_id'), how='inner').reset_index() - - df.to_sql(table_name, self.engine, index=False, if_exists='append') + df.index.rename("id", inplace=True) + df.rename(columns={"chebi_id": "compound_id"}, inplace=True) + + if "compound_id" in df.columns: + df = ( + df_compound_ids.rename(columns={"ID": "compound_id"}) + .set_index("compound_id") + .join(df.set_index("compound_id"), how="inner") + .reset_index() + ) + + if "init_id" in df.columns: + df = ( + df_compound_ids.rename(columns={"ID": "init_id"}) + .set_index("init_id") + .join(df.set_index("init_id"), how="inner") + .reset_index() + ) + + if "final_id" in df.columns: + df = ( + df_compound_ids.rename(columns={"ID": "final_id"}) + .set_index("final_id") + .join(df.set_index("final_id"), how="inner") + .reset_index() + ) + + df.to_sql(table_name, self.engine, index=False, if_exists="append") inserted[table_name] += df.shape[0] self.session.commit() @@ -97,17 
+114,20 @@ def update_bel(self) -> int: """Update the BEL edges with CHEBI metadata.""" updated = 0 sql = "Update {rid} set chebi = {chebi_id}" - chebi_nodes = self.query_class(class_name='bio_object', - columns=['name', '@class'], - namespace='CHEBI', - pure=True) - - for chebi_node in tqdm(chebi_nodes, desc='Update ChEBI identifier in BEL'): - chebi_compound = self.session.query(chebi.Compound.id) \ - .filter(chebi.Compound.name == chebi_node['name']).first() + chebi_nodes = self.query_class( + class_name="bio_object", + columns=["name", "@class"], + namespace="CHEBI", + pure=True, + ) + + for chebi_node in tqdm(chebi_nodes, desc="Update ChEBI identifier in BEL"): + chebi_compound = ( + self.session.query(chebi.Compound.id).filter(chebi.Compound.name == chebi_node["name"]).first() + ) if chebi_compound: - updated += self.execute(sql.format(rid=chebi_node['rid'], chebi_id=chebi_compound[0]))[0] + updated += self.execute(sql.format(rid=chebi_node["rid"], chebi_id=chebi_compound[0]))[0] return updated diff --git a/ebel/manager/orientdb/biodbs/clinical_trials.py b/ebel/manager/orientdb/biodbs/clinical_trials.py index 3fe1fb5..008c567 100644 --- a/ebel/manager/orientdb/biodbs/clinical_trials.py +++ b/ebel/manager/orientdb/biodbs/clinical_trials.py @@ -1,25 +1,21 @@ """Clinical Trials.""" import logging +from collections import namedtuple +from typing import Dict, Optional +from zipfile import ZipFile import pandas as pd - -from tqdm import tqdm from lxml import etree -from zipfile import ZipFile from pyorientdb import OrientDB -from typing import Dict, Optional -from collections import namedtuple - +from tqdm import tqdm from ebel import tools from ebel.constants import RID from ebel.manager.orientdb import odb_meta, urls from ebel.manager.orientdb.constants import CLINICAL_TRIALS - from ebel.manager.rdbms.models import clinical_trials_gov - -Intervention = namedtuple('Intervention', ['intervention_type', 'intervention_name']) +Intervention = 
namedtuple("Intervention", ["intervention_type", "intervention_name"]) logger = logging.getLogger(__name__) @@ -35,10 +31,12 @@ def __init__(self, client: OrientDB = None, condition_keyword="Alzheimer"): self.url = urls.CLINICAL_TRIALS_GOV self.urls = {self.biodb_name: self.url} self.file_path = tools.get_file_path(urls.CLINICAL_TRIALS_GOV, self.biodb_name) - super().__init__(tables_base=clinical_trials_gov.Base, - # nodes=odb_structure.clinical_trials_gov_nodes, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + tables_base=clinical_trials_gov.Base, + # nodes=odb_structure.clinical_trials_gov_nodes, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __len__(self): return self.number_of_generics @@ -53,13 +51,13 @@ def add_link_to_drugbank(self, data_dict: dict, trial_rid: str): # update_sql = 'UPDATE drugbank ADD clinical_trials = {} WHERE name = "{}" OR "{}" in synonyms' # TODO index drugbank.synonyms - results = self.query_class(class_name='drugbank', limit=1) # Check if drugbank is there + results = self.query_class(class_name="drugbank", limit=1) # Check if drugbank is there if results: update_sql = 'UPDATE drugbank ADD clinical_trials = {} WHERE name = "{}"' - if 'drugs_in_trial' in data_dict.keys(): - for drug in data_dict['drugs_in_trial']: + if "drugs_in_trial" in data_dict.keys(): + for drug in data_dict["drugs_in_trial"]: drug = drug.replace('"', "'") # Have to scrub the string # self.execute(update_sql.format(trial_rid, drug, drug)) self.execute(update_sql.format(trial_rid, drug)) @@ -67,52 +65,52 @@ def add_link_to_drugbank(self, data_dict: dict, trial_rid: str): def insert_n2m_tables(self): """Inserts mesh_terms and keywords.""" with ZipFile(self.file_path, "r") as zip_file: - xml_files = [x for x in zip_file.filelist if x.filename.endswith('.xml')] - - d = {'keyword': set(), 'mesh_term': set(), 'condition': set(), 'intervention': set()} + xml_files = [x for x in zip_file.filelist if x.filename.endswith(".xml")] + + d = { + 
"keyword": set(), + "mesh_term": set(), + "condition": set(), + "intervention": set(), + } interventions = set() for f in tqdm(xml_files, desc="Get and set unique values tables in ClinicalTrials.gov"): - with zip_file.open(f.filename) as xml_file: xml_content = xml_file.read() doc = etree.fromstring(xml_content) for child in doc: - if child.tag in ['keyword', 'condition']: + if child.tag in ["keyword", "condition"]: d[child.tag].add(child.text) - elif child.tag == 'condition_browse': - d['mesh_term'].update(set(child.xpath('./mesh_term//text()'))) + elif child.tag == "condition_browse": + d["mesh_term"].update(set(child.xpath("./mesh_term//text()"))) - elif child.tag == 'intervention': + elif child.tag == "intervention": itype, iname = self.__parse_intervention(child) interventions.add( Intervention( intervention_type=(itype.strip() if itype else itype), - intervention_name=(iname.strip() if iname else iname) + intervention_name=(iname.strip() if iname else iname), ) ) df_interventions = pd.DataFrame(interventions) df_interventions.index += 1 - df_interventions.index.rename('id', inplace=True) + df_interventions.index.rename("id", inplace=True) df_interventions.to_sql( clinical_trials_gov.Intervention.__tablename__, self.engine, - if_exists="append" + if_exists="append", ) for column_name, data in d.items(): df = pd.DataFrame(data, columns=[column_name]) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql( - 'clinical_trials_gov_' + column_name, - self.engine, - if_exists='append' - ) + df.index.rename("id", inplace=True) + df.to_sql("clinical_trials_gov_" + column_name, self.engine, if_exists="append") @staticmethod def __parse_intervention(interventions) -> tuple: @@ -122,8 +120,8 @@ def __parse_intervention(interventions) -> tuple: for cchild in interventions: intervention[cchild.tag] = cchild.text.strip() - itype = intervention.get('intervention_type') - iname = intervention.get('intervention_name') + itype = intervention.get("intervention_type") + 
iname = intervention.get("intervention_name") return itype, iname @@ -134,66 +132,77 @@ def __get_first(element: list) -> Optional[str]: def insert_mesh_terms(self, df): """Insert mesh_terms into database.""" - columns = {'id': 'clinical_trials_gov_id', - 'mesh_terms': 'clinical_trials_gov_mesh_term_id'} + columns = { + "id": "clinical_trials_gov_id", + "mesh_terms": "clinical_trials_gov_mesh_term_id", + } table = clinical_trials_gov.ctg_mesh_term_n2m.name - df[['mesh_terms', 'id']] \ - .explode('mesh_terms') \ - .rename(columns=columns).to_sql(table, index=False, if_exists='append', con=self.engine) + df[["mesh_terms", "id"]].explode("mesh_terms").rename(columns=columns).to_sql( + table, index=False, if_exists="append", con=self.engine + ) def insert_keywords(self, df): """Insert keywords into database.""" - columns = {'id': 'clinical_trials_gov_id', - 'keywords': 'clinical_trials_gov_keyword_id'} + columns = { + "id": "clinical_trials_gov_id", + "keywords": "clinical_trials_gov_keyword_id", + } table = clinical_trials_gov.ctg_keyword_n2m.name - df[['keywords', 'id']] \ - .explode('keywords') \ - .rename(columns=columns).to_sql(table, index=False, if_exists='append', con=self.engine) + df[["keywords", "id"]].explode("keywords").rename(columns=columns).to_sql( + table, index=False, if_exists="append", con=self.engine + ) def insert_conditions(self, df): """Insert conditions into database.""" - columns = {'id': 'clinical_trials_gov_id', - 'conditions': 'clinical_trials_gov_condition_id'} + columns = { + "id": "clinical_trials_gov_id", + "conditions": "clinical_trials_gov_condition_id", + } table = clinical_trials_gov.ctg_condition_n2m.name - df[['conditions', 'id']] \ - .explode('conditions') \ - .rename(columns=columns).to_sql(table, index=False, if_exists='append', con=self.engine) + df[["conditions", "id"]].explode("conditions").rename(columns=columns).to_sql( + table, index=False, if_exists="append", con=self.engine + ) def insert_interventions(self, df): 
"""Insert interventions into database.""" - columns = {'id': 'clinical_trials_gov_id', - 'interventions': 'clinical_trials_gov_intervention_id'} + columns = { + "id": "clinical_trials_gov_id", + "interventions": "clinical_trials_gov_intervention_id", + } table = clinical_trials_gov.ctg_intervention_n2m.name - df[['interventions', 'id']] \ - .explode('interventions') \ - .rename(columns=columns).to_sql(table, index=False, if_exists='append', con=self.engine) + df[["interventions", "id"]].explode("interventions").rename(columns=columns).to_sql( + table, index=False, if_exists="append", con=self.engine + ) def insert_data(self) -> Dict[str, int]: """Insert Clinical Trial metadata into database.""" self.recreate_tables() with ZipFile(self.file_path, "r") as zip_file: - xml_files = [x for x in zip_file.filelist if x.filename.endswith('.xml')] + xml_files = [x for x in zip_file.filelist if x.filename.endswith(".xml")] self.insert_n2m_tables() conditions = {x.condition: x.id for x in self.session.query(clinical_trials_gov.Condition).all()} keywords = {x.keyword: x.id for x in self.session.query(clinical_trials_gov.Keyword).all()} mesh_terms = {x.mesh_term: x.id for x in self.session.query(clinical_trials_gov.MeshTerm).all()} - interventions = {(x.intervention_type, x.intervention_name): x.id for x in - self.session.query(clinical_trials_gov.Intervention).all()} + interventions = { + (x.intervention_type, x.intervention_name): x.id + for x in self.session.query(clinical_trials_gov.Intervention).all() + } trials = [] index = 0 - for index, f in tqdm(enumerate(xml_files, 1), - desc=f"Import {self.biodb_name.upper()}", - total=len(xml_files)): - + for index, f in tqdm( + enumerate(xml_files, 1), + desc=f"Import {self.biodb_name.upper()}", + total=len(xml_files), + ): with zip_file.open(f.filename) as xml_file: xml_content = xml_file.read() doc = etree.fromstring(xml_content) data_dict = self.get_data_as_dict(doc, conditions, keywords, mesh_terms, interventions) - 
data_dict['id'] = index + data_dict["id"] = index trials.append(data_dict) if index % 10000 == 0: @@ -206,12 +215,22 @@ def insert_data(self) -> Dict[str, int]: def insert_trials(self, trials): """Insert select Clinical Trial entries into database.""" - cols_multi = ['primary_outcomes', 'secondary_outcomes', 'interventions', - 'mesh_terms', 'keywords', 'conditions'] + cols_multi = [ + "primary_outcomes", + "secondary_outcomes", + "interventions", + "mesh_terms", + "keywords", + "conditions", + ] df = pd.DataFrame(trials) cols = list(set(df.columns) - set(cols_multi)) - df[cols].to_sql(clinical_trials_gov.ClinicalTrialGov.__tablename__, self.engine, if_exists='append', - index=False) + df[cols].to_sql( + clinical_trials_gov.ClinicalTrialGov.__tablename__, + self.engine, + if_exists="append", + index=False, + ) self.insert_mesh_terms(df) self.insert_keywords(df) self.insert_conditions(df) @@ -219,51 +238,64 @@ def insert_trials(self, trials): def get_data_as_dict(self, doc, conditions, keywords, mesh_terms, interventions): """Get metadata as dict based on passed parameters.""" - d = {'primary_outcomes': [], 'secondary_outcomes': [], 'keywords': [], 'conditions': [], 'interventions': [], - 'mesh_terms': []} + d = { + "primary_outcomes": [], + "secondary_outcomes": [], + "keywords": [], + "conditions": [], + "interventions": [], + "mesh_terms": [], + } for child in doc: - if child.tag == 'id_info': - d['nct_id'] = self.__get_first(child.xpath('./nct_id[1]/text()')) - d['org_study_id'] = self.__get_first(child.xpath('./org_study_id[1]/text()')) - - elif child.tag in ['brief_title', 'official_title', 'overall_status', 'start_date', 'completion_date', - 'phase', 'study_type']: + if child.tag == "id_info": + d["nct_id"] = self.__get_first(child.xpath("./nct_id[1]/text()")) + d["org_study_id"] = self.__get_first(child.xpath("./org_study_id[1]/text()")) + + elif child.tag in [ + "brief_title", + "official_title", + "overall_status", + "start_date", + "completion_date", + 
"phase", + "study_type", + ]: d[child.tag] = child.text - elif child.tag in ['brief_summary', 'detailed_description']: - d[child.tag] = self.__get_first(child.xpath('./textblock[1]/text()')) + elif child.tag in ["brief_summary", "detailed_description"]: + d[child.tag] = self.__get_first(child.xpath("./textblock[1]/text()")) - elif child.tag == 'oversight_info': - d['is_fda_regulated_drug'] = self.__get_first(child.xpath('./is_fda_regulated_drug[1]/text()')) + elif child.tag == "oversight_info": + d["is_fda_regulated_drug"] = self.__get_first(child.xpath("./is_fda_regulated_drug[1]/text()")) - elif child.tag == 'condition': - d['conditions'].append(conditions[child.text]) + elif child.tag == "condition": + d["conditions"].append(conditions[child.text]) - elif child.tag == 'condition_browse': - d['mesh_terms'] = [mesh_terms[x] for x in child.xpath('./mesh_term//text()')] + elif child.tag == "condition_browse": + d["mesh_terms"] = [mesh_terms[x] for x in child.xpath("./mesh_term//text()")] - elif child.tag == 'study_design_info': - d['study_design_intervention_model'] = self.__get_first(child.xpath('./intervention_model[1]/text()')) - d['study_design_primary_purpose'] = self.__get_first(child.xpath('./primary_purpose[1]/text()')) - d['study_design_masking'] = self.__get_first(child.xpath('./masking[1]/text()')) + elif child.tag == "study_design_info": + d["study_design_intervention_model"] = self.__get_first(child.xpath("./intervention_model[1]/text()")) + d["study_design_primary_purpose"] = self.__get_first(child.xpath("./primary_purpose[1]/text()")) + d["study_design_masking"] = self.__get_first(child.xpath("./masking[1]/text()")) - elif child.tag in ['primary_outcome', 'secondary_outcome']: + elif child.tag in ["primary_outcome", "secondary_outcome"]: outcomes = dict() for cchild in child: outcomes[cchild.tag] = cchild.text d[child.tag + "s"] = outcomes - elif child.tag == 'keyword': - d['keywords'].append(keywords[child.text]) + elif child.tag == "keyword": + 
d["keywords"].append(keywords[child.text]) - elif child.tag == 'intervention': + elif child.tag == "intervention": itype, iname = self.__parse_intervention(child) formatted_intervention = interventions[(itype, iname)] - d['interventions'].append(formatted_intervention) + d["interventions"].append(formatted_intervention) - elif child.tag == 'patient_data': - d['patient_data_sharing_ipd'] = self.__get_first(child.xpath('./sharing_ipd[1]/text()')) - d['patient_data_ipd_description'] = self.__get_first(child.xpath('./ipd_description[1]/text()')) + elif child.tag == "patient_data": + d["patient_data_sharing_ipd"] = self.__get_first(child.xpath("./sharing_ipd[1]/text()")) + d["patient_data_ipd_description"] = self.__get_first(child.xpath("./ipd_description[1]/text()")) return d @@ -278,11 +310,14 @@ def update_pathology_links(self) -> int: trial_sql = "SELECT @rid.asString() FROM clinical_trial WHERE '{}' in mesh_conditions" update_sql = "UPDATE {} SET clinical_trials = {}" - paths = self.query_class(class_name='pathology', columns=['name'], ) + paths = self.query_class( + class_name="pathology", + columns=["name"], + ) updated = 0 for path in tqdm(paths, desc="Update pathology nodes"): - path_rid, path_name = path[RID], path['name'] + path_rid, path_name = path[RID], path["name"] trial_results = self.client.query(trial_sql.format(path_name)) trial_rids = [x.oRecordData[RID] for x in trial_results] diff --git a/ebel/manager/orientdb/biodbs/clinvar.py b/ebel/manager/orientdb/biodbs/clinvar.py index 49ce5e3..a5d0f47 100644 --- a/ebel/manager/orientdb/biodbs/clinvar.py +++ b/ebel/manager/orientdb/biodbs/clinvar.py @@ -1,28 +1,31 @@ """ClinVar.""" import logging -import pandas as pd +from collections import namedtuple +from typing import Dict, List -from tqdm import tqdm +import pandas as pd from pyorientdb import OrientDB -from typing import Dict, List -from collections import namedtuple +from tqdm import tqdm +from ebel.manager.orientdb import odb_meta, odb_structure, urls 
from ebel.manager.orientdb.constants import CLINVAR -from ebel.manager.orientdb import odb_meta, urls, odb_structure -from ebel.tools import get_file_path, get_disease_trait_keywords_from_config - from ebel.manager.rdbms.models import clinvar - +from ebel.tools import get_disease_trait_keywords_from_config, get_file_path logger = logging.getLogger(__name__) -Snp = namedtuple('Snp', ('keyword', - 'phenotype', - 'rs_number', - 'hgnc_id', - 'chromosome', - 'position', - 'clinical_significance')) +Snp = namedtuple( + "Snp", + ( + "keyword", + "phenotype", + "rs_number", + "hgnc_id", + "chromosome", + "position", + "clinical_significance", + ), +) class ClinVar(odb_meta.Graph): @@ -34,12 +37,14 @@ def __init__(self, client: OrientDB = None): self.biodb_name = CLINVAR self.urls = {self.biodb_name: urls.CLINVAR} self.file_path = get_file_path(urls.CLINVAR, self.biodb_name) - super().__init__(nodes=odb_structure.clinvar_nodes, - edges=odb_structure.clinvar_edges, - indices=odb_structure.clinvar_indices, - urls=self.urls, - tables_base=clinvar.Base, - biodb_name=self.biodb_name) + super().__init__( + nodes=odb_structure.clinvar_nodes, + edges=odb_structure.clinvar_edges, + indices=odb_structure.clinvar_indices, + urls=self.urls, + tables_base=clinvar.Base, + biodb_name=self.biodb_name, + ) def __len__(self): return self.number_of_generics @@ -55,42 +60,86 @@ def insert_data(self) -> Dict[str, int]: df = pd.read_csv(self.file_path, sep="\t", low_memory=False) self._standardize_dataframe(df) df.index += 1 - df.index.rename('id', inplace=True) - df.drop(columns=['phenotype_ids', 'phenotype_list', 'other_ids']).to_sql( - self.biodb_name, self.engine, if_exists='append', chunksize=10000) - - df_clinvar__phenotype = df['phenotype_list'].str.split(r'[|;]').explode().to_frame() \ - .reset_index().rename(columns={'phenotype_list': 'phenotype', 'id': 'clinvar_id'}) + df.index.rename("id", inplace=True) + df.drop(columns=["phenotype_ids", "phenotype_list", "other_ids"]).to_sql( + 
self.biodb_name, self.engine, if_exists="append", chunksize=10000 + ) + + df_clinvar__phenotype = ( + df["phenotype_list"] + .str.split(r"[|;]") + .explode() + .to_frame() + .reset_index() + .rename(columns={"phenotype_list": "phenotype", "id": "clinvar_id"}) + ) df_clinvar__phenotype.index += 1 - df_phenotype = pd.DataFrame(df_clinvar__phenotype.phenotype.unique(), columns=['phenotype']) + df_phenotype = pd.DataFrame(df_clinvar__phenotype.phenotype.unique(), columns=["phenotype"]) df_phenotype.index += 1 - df_phenotype.index.rename('id', inplace=True) - df_phenotype.to_sql(clinvar.ClinvarPhenotype.__tablename__, self.engine, if_exists='append') + df_phenotype.index.rename("id", inplace=True) + df_phenotype.to_sql( + clinvar.ClinvarPhenotype.__tablename__, + self.engine, + if_exists="append", + chunksize=10000, + ) inserted.update({clinvar.ClinvarPhenotype.__tablename__: df_phenotype.shape[0]}) - df_clinvar__phenotype = df_clinvar__phenotype.set_index('phenotype').join( - df_phenotype.assign(clinvar_phenotype_id=df_phenotype.index).set_index('phenotype') - ).reset_index().loc[:, ['clinvar_id', 'clinvar_phenotype_id']] + df_clinvar__phenotype = ( + df_clinvar__phenotype.set_index("phenotype") + .join(df_phenotype.assign(clinvar_phenotype_id=df_phenotype.index).set_index("phenotype")) + .reset_index() + .loc[:, ["clinvar_id", "clinvar_phenotype_id"]] + ) df_clinvar__phenotype.index += 1 - df_clinvar__phenotype.index.rename('id', inplace=True) - df_clinvar__phenotype.to_sql('clinvar__phenotype', self.engine, if_exists='append', index=False) - inserted.update({clinvar.clinvar__clinvar_phenotype.__dict__['fullname']: df_clinvar__phenotype.shape[0]}) - - clinvar_pheno_medgen = df['phenotype_ids'].str.split(r'[|,;]').explode().str.partition(':')[[0, 2]] \ - .rename(columns={0: 'db', 2: 'identifier'}) - clinvar_pheno_medgen['clinvar_id'] = clinvar_pheno_medgen.index - clinvar_phenotype_medgen = clinvar_pheno_medgen.set_index('db').loc['MedGen'].reset_index().drop( - 
columns=['db']) + df_clinvar__phenotype.index.rename("id", inplace=True) + df_clinvar__phenotype.to_sql( + "clinvar__phenotype", + self.engine, + if_exists="append", + index=False, + chunksize=10000, + ) + inserted.update({clinvar.clinvar__clinvar_phenotype.__dict__["fullname"]: df_clinvar__phenotype.shape[0]}) + + clinvar_pheno_medgen = ( + df["phenotype_ids"] + .str.split(r"[|,;]") + .explode() + .str.partition(":")[[0, 2]] + .rename(columns={0: "db", 2: "identifier"}) + ) + clinvar_pheno_medgen["clinvar_id"] = clinvar_pheno_medgen.index + clinvar_phenotype_medgen = ( + clinvar_pheno_medgen.set_index("db").loc["MedGen"].reset_index().drop(columns=["db"]) + ) clinvar_phenotype_medgen.index += 1 - clinvar_phenotype_medgen.index.rename('id', inplace=True) - clinvar_phenotype_medgen.to_sql(clinvar.ClinvarPhenotypeMedgen.__tablename__, self.engine, if_exists='append') + clinvar_phenotype_medgen.index.rename("id", inplace=True) + clinvar_phenotype_medgen.to_sql( + clinvar.ClinvarPhenotypeMedgen.__tablename__, + self.engine, + if_exists="append", + chunksize=10000, + ) inserted.update({clinvar.ClinvarPhenotypeMedgen.__tablename__: clinvar_phenotype_medgen.shape[0]}) - df_other_ids = df.other_ids.str.split(',').dropna().explode().str.partition(':')[[0, 2]].reset_index().rename( - columns={0: 'db', 2: 'identifier', 'id': 'clinvar_id'}).rename_axis("id") + df_other_ids = ( + df.other_ids.str.split(",") + .dropna() + .explode() + .str.partition(":")[[0, 2]] + .reset_index() + .rename(columns={0: "db", 2: "identifier", "id": "clinvar_id"}) + .rename_axis("id") + ) df_other_ids.index += 1 - df_other_ids.to_sql(clinvar.ClinvarOtherIdentifier.__tablename__, self.engine, if_exists='append') + df_other_ids.to_sql( + clinvar.ClinvarOtherIdentifier.__tablename__, + self.engine, + if_exists="append", + chunksize=10000, + ) inserted.update({clinvar.ClinvarOtherIdentifier.__tablename__: df_other_ids.shape[0]}) return inserted @@ -135,14 +184,17 @@ def get_disease_snps_dict(self) -> 
Dict[str, List[Snp]]: def update_interactions(self) -> int: """Create SNPs and edges using information from ClinVar.""" # snp is upstream of a downstream gene - snp_type = {'mapped': "mapped", 'downstream': "upstream", 'upstream': "downstream"} + snp_type = { + "mapped": "mapped", + "downstream": "upstream", + "upstream": "downstream", + } added_edges = 0 disease_snps_dict = self.get_disease_snps_dict() hgnc_id_gene_rid_cache = {} for disease, rows in disease_snps_dict.items(): - - for snp in tqdm(rows, desc=f'Add has_X_snp_cv edges to BEL for {disease}'): + for snp in tqdm(rows, desc=f"Add has_X_snp_cv edges to BEL for {disease}"): if snp.hgnc_id in hgnc_id_gene_rid_cache: gene_mapped_rid = hgnc_id_gene_rid_cache[snp.hgnc_id] else: @@ -150,31 +202,37 @@ def update_interactions(self) -> int: hgnc_id_gene_rid_cache[snp.hgnc_id] = gene_mapped_rid if gene_mapped_rid: - snp_rid = self.get_create_rid('snp', {'rs_number': "rs" + str(snp.rs_number)}) - value_dict = {'clinical_significance': snp.clinical_significance, - 'phenotype': snp.phenotype, - 'keyword': snp.keyword} - self.create_edge(class_name='has_mapped_snp_cv', - from_rid=gene_mapped_rid, - to_rid=snp_rid, - value_dict=value_dict, - if_not_exists=True) + snp_rid = self.get_create_rid("snp", {"rs_number": "rs" + str(snp.rs_number)}) + value_dict = { + "clinical_significance": snp.clinical_significance, + "phenotype": snp.phenotype, + "keyword": snp.keyword, + } + self.create_edge( + class_name="has_mapped_snp_cv", + from_rid=gene_mapped_rid, + to_rid=snp_rid, + value_dict=value_dict, + if_not_exists=True, + ) added_edges += 1 # fetch all down and upstream gene_rids gene_type_rids = self.get_set_gene_rids_by_position( chromosome=snp.chromosome, position=snp.position, - gene_types=['downstream', 'upstream'] + gene_types=["downstream", "upstream"], ) for gene_type, gene_rids in gene_type_rids.items(): for gene_rid in gene_rids: class_name = f"has_{snp_type[gene_type]}_snp_cv" - 
self.create_edge(class_name=class_name, - from_rid=gene_rid, - to_rid=snp_rid, - value_dict=value_dict) + self.create_edge( + class_name=class_name, + from_rid=gene_rid, + to_rid=snp_rid, + value_dict=value_dict, + ) added_edges += 1 return added_edges @@ -182,16 +240,17 @@ def _get_set_gene_rid(self, hgnc_id: str): """Insert gene (if not exists) and returns OrientDB @rid.""" gene_rid = None - hgnc_rids = self.query_class('hgnc', columns=['symbol'], limit=1, id=hgnc_id) + hgnc_rids = self.query_class("hgnc", columns=["symbol"], limit=1, id=hgnc_id) if hgnc_rids: gene = hgnc_rids[0] bel = f'g(HGNC:"{gene["symbol"]}")' - data = {'pure': True, - 'bel': bel, - 'name': gene['symbol'], - 'namespace': "HGNC" - } - gene_rid = self.get_create_rid('gene', data, check_for='bel') + data = { + "pure": True, + "bel": bel, + "name": gene["symbol"], + "namespace": "HGNC", + } + gene_rid = self.get_create_rid("gene", data, check_for="bel") return gene_rid diff --git a/ebel/manager/orientdb/biodbs/disgenet.py b/ebel/manager/orientdb/biodbs/disgenet.py index 88e992f..c1d27f7 100644 --- a/ebel/manager/orientdb/biodbs/disgenet.py +++ b/ebel/manager/orientdb/biodbs/disgenet.py @@ -1,16 +1,15 @@ """DisGeNet.""" import logging -import pandas as pd +from typing import Dict, Optional -from tqdm import tqdm -from typing import Dict +import pandas as pd from pyorientdb import OrientDB +from tqdm import tqdm +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import DISGENET -from ebel.manager.orientdb import odb_meta, urls, odb_structure -from ebel.tools import get_file_path, get_disease_trait_keywords_from_config - from ebel.manager.rdbms.models import disgenet +from ebel.tools import get_disease_trait_keywords_from_config, get_file_path logger = logging.getLogger(__name__) @@ -18,18 +17,20 @@ class DisGeNet(odb_meta.Graph): """DisGeNet (https://www.disgenet.org).""" - def __init__(self, client: OrientDB = None): + def __init__(self, client: 
Optional[OrientDB] = None): """Init DisGeNet.""" self.client = client self.biodb_name = DISGENET self.urls = { - 'disgenet_gene': urls.DISGENET_GDP_ASSOC, - 'disgenet_variant': urls.DISGENET_VDP_ASSOC, + "disgenet_gene": urls.DISGENET_GDP_ASSOC, + "disgenet_variant": urls.DISGENET_VDP_ASSOC, } - super().__init__(tables_base=disgenet.Base, - edges=odb_structure.disgenet_edges, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + tables_base=disgenet.Base, + edges=odb_structure.disgenet_edges, + urls=self.urls, + biodb_name=self.biodb_name, + ) self.disease_keywords = get_disease_trait_keywords_from_config() @@ -43,22 +44,18 @@ def __contains__(self, item): def __repr__(self) -> str: """Represent DisGeNet as string.""" template = "{{BioDatabase:DisGeNet}}[url:{url}, edges:{edges}, generics:{generics}]" - representation = template.format( - url=self.urls, - edges=self.number_of_edges, - generics=self.number_of_generics - ) + representation = template.format(url=self.urls, edges=self.number_of_edges, generics=self.number_of_generics) return representation def insert_data(self) -> Dict[str, int]: """Insert data into database.""" logger.info(f"Import {self.biodb_name.upper()}") inserted = dict() - inserted['sources'] = self._insert_sources() - inserted['gene_symbols'] = self._insert_gene_symbols() - inserted['gene_disease_names'] = self._insert_disease_names() - inserted['gene_disease_pmid_associations'] = self._insert_gene_disease_pmid_associations() - inserted['variant_disease_pmid_associations'] = self._insert_variant_disease_pmid_associations() + inserted["sources"] = self._insert_sources() + inserted["gene_symbols"] = self._insert_gene_symbols() + inserted["gene_disease_names"] = self._insert_disease_names() + inserted["gene_disease_pmid_associations"] = self._insert_gene_disease_pmid_associations() + inserted["variant_disease_pmid_associations"] = self._insert_variant_disease_pmid_associations() return inserted def __get_file_for_model(self, 
model): @@ -76,65 +73,81 @@ def file_path_variant(self): return self.__get_file_for_model(disgenet.DisgenetVariant) def _insert_sources(self): - df_g = pd.read_csv(self.file_path_gene, sep="\t", usecols=['source']).drop_duplicates() - df_v = pd.read_csv(self.file_path_variant, sep="\t", usecols=['source']).drop_duplicates() + df_g = pd.read_csv(self.file_path_gene, sep="\t", usecols=["source"]).drop_duplicates() + df_v = pd.read_csv(self.file_path_variant, sep="\t", usecols=["source"]).drop_duplicates() df = pd.concat([df_g, df_v]).drop_duplicates() df.reset_index(inplace=True, drop=True) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(disgenet.DisgenetSource.__tablename__, self.engine, if_exists='append') + df.index.rename("id", inplace=True) + df.to_sql(disgenet.DisgenetSource.__tablename__, self.engine, if_exists="append") return df.shape[0] def _insert_disease_names(self) -> int: - columns_disease = {'diseaseId': 'disease_id', 'diseaseName': 'disease_name'} + columns_disease = {"diseaseId": "disease_id", "diseaseName": "disease_name"} - df_gene = pd.read_csv(self.file_path_gene, sep="\t", usecols=columns_disease.keys()) \ - .rename(columns=columns_disease) \ - .drop_duplicates().set_index('disease_id') + df_gene = ( + pd.read_csv(self.file_path_gene, sep="\t", usecols=list(columns_disease.keys())) + .rename(columns=columns_disease) + .drop_duplicates() + .set_index("disease_id") + ) - df_variant = pd.read_csv(self.file_path_variant, sep="\t", usecols=columns_disease.keys()) \ - .rename(columns=columns_disease) \ - .drop_duplicates().set_index('disease_id') + df_variant = ( + pd.read_csv(self.file_path_variant, sep="\t", usecols=list(columns_disease.keys())) + .rename(columns=columns_disease) + .drop_duplicates() + .set_index("disease_id") + ) df_concat = pd.concat([df_gene, df_variant]).drop_duplicates() - df_concat.to_sql(disgenet.DisgenetDisease.__tablename__, self.engine, if_exists='append') + 
df_concat.to_sql(disgenet.DisgenetDisease.__tablename__, self.engine, if_exists="append") return df_concat.shape[0] def _insert_gene_symbols(self) -> int: - columns_gene_symols = {'geneId': "gene_id", 'geneSymbol': "gene_symbol"} - df = pd.read_csv(self.file_path_gene, sep="\t", usecols=columns_gene_symols.keys()) \ - .rename(columns=columns_gene_symols) \ - .drop_duplicates().set_index('gene_id') - df.to_sql(disgenet.DisgenetGeneSymbol.__tablename__, self.engine, if_exists='append') + columns_gene_symols = {"geneId": "gene_id", "geneSymbol": "gene_symbol"} + df = ( + pd.read_csv(self.file_path_gene, sep="\t", usecols=list(columns_gene_symols.keys())) + .rename(columns=columns_gene_symols) + .drop_duplicates() + .set_index("gene_id") + ) + df.to_sql(disgenet.DisgenetGeneSymbol.__tablename__, self.engine, if_exists="append") return df.shape[0] def _merge_with_source(self, df): - df_sources = pd.read_sql_table(disgenet.DisgenetSource.__tablename__, self.engine) \ - .rename(columns={'id': 'source_id'}) - return pd.merge(df, df_sources, on="source").drop(columns=['source']) + df_sources = pd.read_sql_table(disgenet.DisgenetSource.__tablename__, self.engine).rename( + columns={"id": "source_id"} + ) + return pd.merge(df, df_sources, on="source").drop(columns=["source"]) def _insert_gene_disease_pmid_associations(self) -> int: - usecols_gene = ['geneId', 'diseaseId', 'score', 'pmid', 'source'] + usecols_gene = ["geneId", "diseaseId", "score", "pmid", "source"] rename_dict = dict(zip(usecols_gene, self._standardize_column_names(usecols_gene))) - df = pd.read_csv(self.file_path_gene, sep="\t", usecols=usecols_gene) \ - .rename(columns=rename_dict) + df = pd.read_csv(self.file_path_gene, sep="\t", usecols=usecols_gene).rename(columns=rename_dict) df = self._merge_with_source(df) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(disgenet.DisgenetGene.__tablename__, self.engine, if_exists='append') + df.index.rename("id", inplace=True) + 
df.to_sql(disgenet.DisgenetGene.__tablename__, self.engine, if_exists="append") return df.shape[0] - def _insert_variant_disease_pmid_associations(self) -> Dict[str, int]: - usecols_variant = ['snpId', 'chromosome', 'position', 'diseaseId', 'score', 'pmid', 'source'] + def _insert_variant_disease_pmid_associations(self) -> int: + usecols_variant = [ + "snpId", + "chromosome", + "position", + "diseaseId", + "score", + "pmid", + "source", + ] rename_dict = dict(zip(usecols_variant, self._standardize_column_names(usecols_variant))) - df = pd.read_csv(self.file_path_variant, sep="\t", usecols=usecols_variant) \ - .rename(columns=rename_dict) + df = pd.read_csv(self.file_path_variant, sep="\t", usecols=usecols_variant).rename(columns=rename_dict) df = self._merge_with_source(df) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(disgenet.DisgenetVariant.__tablename__, self.engine, if_exists='append') + df.index.rename("id", inplace=True) + df.to_sql(disgenet.DisgenetVariant.__tablename__, self.engine, if_exists="append") return df.shape[0] @@ -143,7 +156,7 @@ def update_interactions(self) -> int: self.clear_nodes_and_edges() # self.update_diseases() inserted = self.update_snps() - self.delete_nodes_with_no_edges('snp') + self.delete_nodes_with_no_edges("snp") return inserted def update_diseases(self): @@ -156,7 +169,11 @@ def update_diseases(self): def update_snps(self) -> int: """Update SNP information.""" - snp_type = {'mapped': "mapped", 'downstream': "upstream", 'upstream': "downstream"} + snp_type = { + "mapped": "mapped", + "downstream": "upstream", + "upstream": "downstream", + } # TODO: replace SQL with SQL Alchemy statement sql_temp = """Select snp_id, @@ -190,35 +207,40 @@ def update_snps(self) -> int: inserted = 0 - snps = {x['rs_number']: x['rid'] for x in self.query_class('snp', columns=['rs_number'])} + snps = {x["rs_number"]: x["rid"] for x in self.query_class("snp", columns=["rs_number"])} for trait, kwd_disease_results in results.items(): - 
for r in tqdm(kwd_disease_results, - desc=f'Update DisGeNET variant interactions for {trait}', - total=kwd_disease_results.rowcount): + for r in tqdm( + kwd_disease_results, + desc=f"Update DisGeNET variant interactions for {trait}", + total=kwd_disease_results.rowcount, + ): snp_id, chromosome, position, disease_name, pmid, score, source = r if snp_id in snps: snp_rid = snps[snp_id] else: - snp_rid = self.insert_record(class_name='snp', value_dict={'rs_number': snp_id}) + snp_rid = self.insert_record(class_name="snp", value_dict={"rs_number": snp_id}) snps[snp_id] = snp_rid gene_type_rids = self.get_set_gene_rids_by_position(chromosome, position) for gene_type, gene_rids in gene_type_rids.items(): for gene_rid in gene_rids: - value_dict = {'disease_name': disease_name, - 'score': score, - 'source': source, - 'pmid': pmid - } + value_dict = { + "disease_name": disease_name, + "score": score, + "source": source, + "pmid": pmid, + } class_name = f"has_{snp_type[gene_type]}_snp_dgn" - self.create_edge(class_name=class_name, - from_rid=gene_rid, - to_rid=snp_rid, - value_dict=value_dict) + self.create_edge( + class_name=class_name, + from_rid=gene_rid, + to_rid=snp_rid, + value_dict=value_dict, + ) inserted += 1 return inserted diff --git a/ebel/manager/orientdb/biodbs/drugbank.py b/ebel/manager/orientdb/biodbs/drugbank.py index e2f63ec..eb6973e 100644 --- a/ebel/manager/orientdb/biodbs/drugbank.py +++ b/ebel/manager/orientdb/biodbs/drugbank.py @@ -1,112 +1,143 @@ """DrugBank.""" +import getpass +import logging import os +import os.path import platform import re import signal -import getpass -import os.path -import logging -import requests - +from collections import namedtuple +from configparser import ConfigParser +from datetime import datetime from time import time -from tqdm import tqdm -from typing import Dict +from typing import Dict, Optional from zipfile import ZipFile -from datetime import datetime -from pyorientdb import OrientDB + +import requests from 
lxml.etree import iterparse -from collections import namedtuple -from configparser import ConfigParser +from pyorientdb import OrientDB +from tqdm import tqdm +from ebel.config import get_config_as_dict, write_to_config from ebel.constants import DATA_DIR, RID from ebel.defaults import config_file_path +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import DRUGBANK -from ebel.config import write_to_config, get_config_as_dict -from ebel.manager.orientdb import odb_meta, urls, odb_structure - from ebel.manager.rdbms.models import drugbank -ACTIONS = 'actions' -SYMBOLS = 'symbols' -KNOWN_ACTION = 'known_action' +ACTIONS = "actions" +SYMBOLS = "symbols" +KNOWN_ACTION = "known_action" logger = logging.getLogger(__name__) -XML_URL = 'http://www.drugbank.ca' -XML_NAMESPACE = f'{{{XML_URL}}}' +XML_URL = "http://www.drugbank.ca" +XML_NAMESPACE = f"{{{XML_URL}}}" XN = {"n": XML_URL} -Xpath = namedtuple('Xpath', ('references', 'synonyms', 'product_names', 'pathways', 'target_known_action', - 'ex_ids_resourec', 'ex_ids_id', 'target_actions', 'target_uniprot')) +Xpath = namedtuple( + "Xpath", + ( + "references", + "synonyms", + "product_names", + "pathways", + "target_known_action", + "ex_ids_resourec", + "ex_ids_id", + "target_actions", + "target_uniprot", + ), +) class DrugBank(odb_meta.Graph): """Drugbank.""" - def __init__(self, client: OrientDB = None): + def __init__(self, client: Optional[OrientDB] = None): """Drugbank database. 
documentation: https://www.drugbank.ca/documentation """ self.client = client self.biodb_name = DRUGBANK - self.urls = {'drugbank_version': urls.DRUGBANK_VERSION, self.biodb_name: urls.DRUGBANK_DATA} - super().__init__(nodes=odb_structure.drugbank_nodes, - edges=odb_structure.drugbank_edges, - tables_base=drugbank.Base, - indices=odb_structure.drugbank_indices, - urls=self.urls, - biodb_name=self.biodb_name) + self.urls = { + "drugbank_version": urls.DRUGBANK_VERSION, + self.biodb_name: urls.DRUGBANK_DATA, + } + super().__init__( + nodes=odb_structure.drugbank_nodes, + edges=odb_structure.drugbank_edges, + tables_base=drugbank.Base, + indices=odb_structure.drugbank_indices, + urls=self.urls, + biodb_name=self.biodb_name, + ) self.biodb_dir = os.path.join(DATA_DIR, self.biodb_name) os.makedirs(self.biodb_dir, exist_ok=True) - self.file_path = os.path.join(self.biodb_dir, 'drugbank_all_full_database.xml.zip') - self.file_name_unzipped = 'full database.xml' + self.file_path = os.path.join(self.biodb_dir, "drugbank_all_full_database.xml.zip") + self.file_name_unzipped = "full database.xml" self.file_path_unzipped = os.path.join(self.biodb_dir, self.file_name_unzipped) self.xpath_pattern = Xpath( - references='./n:articles/n:article/n:pubmed-id/text()', - synonyms='./n:synonym/text()', - product_names='./n:product/n:name/text()', - ex_ids_resourec='./n:resource[1]/text()', - ex_ids_id='./n:identifier[1]/text()', - target_actions='./n:actions[1]/n:action[1]/text()', - target_known_action='./n:known-action[1]/text()', + references="./n:articles/n:article/n:pubmed-id/text()", + synonyms="./n:synonym/text()", + product_names="./n:product/n:name/text()", + ex_ids_resourec="./n:resource[1]/text()", + ex_ids_id="./n:identifier[1]/text()", + target_actions="./n:actions[1]/n:action[1]/text()", + target_known_action="./n:known-action[1]/text()", target_uniprot="./n:polypeptide[@source='Swiss-Prot']/@id", - pathways='./n:pathway/n:smpdb-id/text()' + 
pathways="./n:pathway/n:smpdb-id/text()", ) def __len__(self): """Get number of 'has_drug_target' graph edges.""" - return self.execute("Select count(*) from has_drug_target")[0].oRecordData['count'] + return self.execute("Select count(*) from has_drug_target")[0].oRecordData["count"] def __contains__(self, unique_identifier): """Check if drug with identifier '?' exists in graph.""" pass def _unzip_file(self): - logger.info(f'unzip {self.file_path}') - with ZipFile(self.file_path, 'r') as zip_ref: + logger.info(f"unzip {self.file_path}") + with ZipFile(self.file_path, "r") as zip_ref: zip_ref.extract(self.file_name_unzipped, self.biodb_dir) def insert_data(self) -> Dict[str, int]: """Insert data and returns number of inserts.""" - logger.info('Insert DrugBank data') + logger.info("Insert DrugBank data") self._unzip_file() drug_index = 0 - columns = ['name', 'description', 'cas-number', 'unii', 'state', 'indication', 'pharmacodynamics', - 'toxicity', 'metabolism', 'absorption', 'half-life', 'route-of-elimination', - 'volume-of-distribution', 'clearance', 'mechanism-of-action', 'fda-label'] - - doc = iterparse(self.file_path_unzipped, events=('end',), tag=f'{XML_NAMESPACE}drug') + columns = [ + "name", + "description", + "cas-number", + "unii", + "state", + "indication", + "pharmacodynamics", + "toxicity", + "metabolism", + "absorption", + "half-life", + "route-of-elimination", + "volume-of-distribution", + "clearance", + "mechanism-of-action", + "fda-label", + ] + + doc = iterparse(self.file_path_unzipped, events=("end",), tag=f"{XML_NAMESPACE}drug") for action, elem in tqdm(doc, desc=f"Import {self.biodb_name.upper()}"): if "type" in elem.attrib: drug_index += 1 - drug = {'id': drug_index} + drug = {"id": drug_index} references, synonyms = None, None targets = [] external_identifiers = [] @@ -117,41 +148,42 @@ def insert_data(self) -> Dict[str, int]: pathways = [] for child in elem.iterchildren(): - ctag = child.tag[len(XML_NAMESPACE):] + ctag = 
child.tag[len(XML_NAMESPACE) :] if ctag in columns: - drug[ctag.replace('-', '_')] = child.text + drug[ctag.replace("-", "_")] = child.text - elif ctag == 'drugbank-id': - primary_id = child.attrib.get('primary') + elif ctag == "drugbank-id": + primary_id = child.attrib.get("primary") if primary_id: - drug['drugbank_id'] = child.text + drug["drugbank_id"] = child.text - elif ctag == 'general-references': + elif ctag == "general-references": pmid_strs = child.xpath(self.xpath_pattern.references, namespaces=XN) references = [drugbank.Reference(pmid=int(x)) for x in pmid_strs] - elif ctag == 'synonyms': + elif ctag == "synonyms": syns = child.xpath(self.xpath_pattern.synonyms, namespaces=XN) synonyms = [drugbank.Synonym(synonym=x) for x in set(syns)] - elif ctag == 'products': + elif ctag == "products": pro_names = child.xpath(self.xpath_pattern.product_names, namespaces=XN) product_names = [drugbank.ProductName(name=x) for x in set(pro_names)] - elif ctag == 'drug-interactions': + elif ctag == "drug-interactions": for di_child in child: - di = {x.tag[len(XML_NAMESPACE):].replace('-', '_'): x.text for x in di_child} + di = {x.tag[len(XML_NAMESPACE) :].replace("-", "_"): x.text for x in di_child} drug_interactions.append(drugbank.DrugInteraction(**di)) - elif ctag == 'external-identifiers': + elif ctag == "external-identifiers": for ex_ids_child in child: resource = ex_ids_child.xpath(self.xpath_pattern.ex_ids_resourec, namespaces=XN)[0] identifier = ex_ids_child.xpath(self.xpath_pattern.ex_ids_id, namespaces=XN)[0] - external_identifiers.append(drugbank.ExternalIdentifier(resource=resource, - identifier=identifier)) + external_identifiers.append( + drugbank.ExternalIdentifier(resource=resource, identifier=identifier) + ) - elif ctag == 'targets': + elif ctag == "targets": for target in child: u = target.xpath(self.xpath_pattern.target_uniprot, namespaces=XN) uniprot = u[0] if u else None @@ -160,30 +192,39 @@ def insert_data(self) -> Dict[str, int]: actions = 
target.xpath(self.xpath_pattern.target_actions, namespaces=XN) action = actions[0] if actions else None - kactions = target.xpath(self.xpath_pattern.target_known_action, namespaces=XN) + kactions = target.xpath( + self.xpath_pattern.target_known_action, + namespaces=XN, + ) known_action = kactions[0] if kactions else None targets.append( - drugbank.Target(uniprot=uniprot, action=action, known_action=known_action)) - - elif ctag == 'patents': + drugbank.Target( + uniprot=uniprot, + action=action, + known_action=known_action, + ) + ) + + elif ctag == "patents": for patent in child: patent_dict = {} for patent_child in patent: - patent_key = patent_child.tag[len(XML_NAMESPACE):].replace('-', '_') + patent_key = patent_child.tag[len(XML_NAMESPACE) :].replace("-", "_") patent_value = patent_child.text - if patent_key in ('expires', 'approved') and re.search(r'^\d{4}-\d{2}-\d{2}$', - patent_value): - patent_value = datetime.strptime(patent_value.strip(), '%Y-%m-%d').date() + if patent_key in ("expires", "approved") and re.search( + r"^\d{4}-\d{2}-\d{2}$", patent_value + ): + patent_value = datetime.strptime(patent_value.strip(), "%Y-%m-%d").date() patent_dict[patent_key] = patent_value if patent_dict: patents.append(drugbank.Patent(**patent_dict)) - elif ctag == 'groups': + elif ctag == "groups": for group in child: statuses.append(drugbank.Status(status=group.text)) - elif ctag == 'pathways': + elif ctag == "pathways": pws = child.xpath(self.xpath_pattern.pathways, namespaces=XN) pathways = [drugbank.Pathway(smpdb_id=x) for x in pws] @@ -199,7 +240,7 @@ def insert_data(self) -> Dict[str, int]: statuses=statuses, patents=patents, pathways=pathways, - **drug + **drug, ) self.session.add(drugbank_instance) # insert in chunks of ... 
@@ -239,7 +280,11 @@ def timeout_error(*_): signal.signal(signal.SIGALRM, signal.SIG_IGN) return None - def get_user_passwd(self, drugbank_user: str = None, drugbank_password: str = None) -> list: + def get_user_passwd( + self, + drugbank_user: Optional[str] = None, + drugbank_password: Optional[str] = None, + ) -> Optional[list]: """Read username and password from configuration file.""" section_name = "DRUGBANK" conf = None @@ -258,12 +303,11 @@ def get_user_passwd(self, drugbank_user: str = None, drugbank_password: str = No conf = get_config_as_dict()[section_name] if not section_exists and (drugbank_user and drugbank_password): - conf = {'user': drugbank_user, 'password': drugbank_password} - write_to_config(section_name, 'user', drugbank_user) - write_to_config(section_name, 'password', drugbank_password) + conf = {"user": drugbank_user, "password": drugbank_password} + write_to_config(section_name, "user", drugbank_user) + write_to_config(section_name, "password", drugbank_password) if not section_exists or not config_exists: - prompt = "Do you have an approved account with DrugBank [y/n]: " timeout_msg = "No answer was provided. Skipping DrugBank update...\n" timeout = 20 @@ -272,8 +316,7 @@ def get_user_passwd(self, drugbank_user: str = None, drugbank_password: str = No num_tries = 0 max_tries = 4 - while num_tries < max_tries and answer not in ['yes', 'y', 'no', 'n'] and answer is not None: - + while num_tries < max_tries and answer not in ["yes", "y", "no", "n"] and answer is not None: print("Invalid response!\n") answer = self.get_timed_answer(prompt=prompt, timeout=timeout, timeout_msg=timeout_msg) num_tries += 1 @@ -282,19 +325,19 @@ def get_user_passwd(self, drugbank_user: str = None, drugbank_password: str = No print("Too many invalid responses. 
Skipping DrugBank update...\n") num_tries += 1 - if answer in ['yes', 'y']: + if answer in ["yes", "y"]: user = input("Insert DrugBank user name:\n") passwd = getpass.getpass("Insert DrugBank password:\n") - write_to_config(section_name, 'user', user) - write_to_config(section_name, 'password', passwd) + write_to_config(section_name, "user", user) + write_to_config(section_name, "password", passwd) conf = get_config_as_dict()[section_name] - elif answer in ['no', 'n'] or num_tries >= max_tries or answer is None: # If no, write 'NA' to config - write_to_config(section_name, 'user', 'NA') - write_to_config(section_name, 'password', 'NA') + elif answer in ["no", "n"] or num_tries >= max_tries or answer is None: # If no, write 'NA' to config + write_to_config(section_name, "user", "NA") + write_to_config(section_name, "password", "NA") - if conf and conf['user'] != 'NA': # Return None if user in config is 'NA' - return [conf['user'], conf['password']] + if conf and conf["user"] != "NA": # Return None if user in config is 'NA' + return [conf["user"], conf["password"]] def insert(self) -> Dict[str, int]: """Check if files missing for download or generic table empty. If True then insert data.""" @@ -302,14 +345,17 @@ def insert(self) -> Dict[str, int]: self.download() if os.path.exists(self.file_path): - drugbank_table_exists = self.engine.dialect.has_table(self.engine.connect(), - drugbank.Drugbank.__tablename__) + drugbank_table_exists = self.engine.dialect.has_table( + self.engine.connect(), drugbank.Drugbank.__tablename__ + ) if not drugbank_table_exists or self.session.query(drugbank.Drugbank.id).count() == 0: self.recreate_tables() inserted.update(self.insert_data()) else: - logger.error('Drugbank data can not be inserted because the file does not exist. Please download manually ' - f'and move to {self.biodb_dir}') + logger.error( + "Drugbank data can not be inserted because the file does not exist. 
Please download manually " + f"and move to {self.biodb_dir}" + ) return inserted def download(self) -> dict: @@ -333,7 +379,7 @@ def download(self) -> dict: response = requests.get(file_to_download, auth=(user, passwd)) if response.ok and response.status_code == 200: # The file is not empty - with open(self.file_path, 'wb') as drugbank_data_file: + with open(self.file_path, "wb") as drugbank_data_file: drugbank_data_file.write(response.content) downloaded = True @@ -344,19 +390,19 @@ def download(self) -> dict: def __latest_version(self) -> str: """Gets latest version of Drugbank and adds it to URL.""" - webpage = requests.get(self.urls['drugbank_version']) + webpage = requests.get(self.urls["drugbank_version"]) version = re.findall(r"(DrugBank Release Version) (\d\.\d\.\d)", webpage.text)[0][1] return self.urls[self.biodb_name].format(version.replace(".", "-")) def get_drugbank_id_rids(self) -> Dict[str, str]: """Get dict of DrugBank IDs as keys and their rIDs as values.""" - rows = self.execute("Select drugbank_id, @rid.asString() as rid from drug") - return {x.oRecordData['drugbank_id']: x.oRecordData[RID] for x in rows} + rows = self.execute("Select drugbank_id, @rid.asString() as rid from drug") + return {x.oRecordData["drugbank_id"]: x.oRecordData[RID] for x in rows if "drugbank_id" in x.oRecordData} @staticmethod def _replace_new_lines(input_string: str) -> str: if input_string and input_string.strip(): - return input_string.strip().replace('\r\n', '
').replace('\n', '
') + return input_string.strip().replace("\r\n", "
").replace("\n", "
") return input_string def update_interactions(self) -> int: @@ -365,7 +411,7 @@ def update_interactions(self) -> int: drugbank_table_exists = self.engine.dialect.has_table(self.engine.connect(), drugbank.Drugbank.__tablename__) if not drugbank_table_exists: - logger.error('Update failed - DrugBank table does not exist.') + logger.error("Update failed - DrugBank table does not exist.") return 0 drugbank_id_rids = self.get_drugbank_id_rids() @@ -376,7 +422,7 @@ def update_interactions(self) -> int: WHERE pure=true and uniprot IS NOT NULL AND namespace = 'HGNC'""" for row in tqdm(self.execute(sql), desc=f"Update {self.biodb_name.upper()} interaction."): r = row.oRecordData - protein_rid, uniprot = r['rid'], r['uniprot'] + protein_rid, uniprot = r["rid"], r["uniprot"] query = self.session.query(drugbank.Target).filter(drugbank.Target.uniprot == uniprot) for target in query.all(): drugbank_id = target.drugbank.drugbank_id @@ -385,28 +431,30 @@ def update_interactions(self) -> int: drug_rid = drugbank_id_rids[drugbank_id] else: value_dict_drug = { - 'drugbank_id': target.drugbank.drugbank_id, - 'label': target.drugbank.name, - 'description': self._replace_new_lines(target.drugbank.description), - 'cas_number': target.drugbank.cas_number, - 'indication': self._replace_new_lines(target.drugbank.indication), - 'pharmacodynamics': self._replace_new_lines(target.drugbank.pharmacodynamics), - 'toxicity': self._replace_new_lines(target.drugbank.toxicity), - 'metabolism': self._replace_new_lines(target.drugbank.metabolism), - 'mechanism_of_action': self._replace_new_lines(target.drugbank.mechanism_of_action) + "drugbank_id": target.drugbank.drugbank_id, + "label": target.drugbank.name, + "description": self._replace_new_lines(target.drugbank.description), + "cas_number": target.drugbank.cas_number, + "indication": self._replace_new_lines(target.drugbank.indication), + "pharmacodynamics": self._replace_new_lines(target.drugbank.pharmacodynamics), + "toxicity": 
self._replace_new_lines(target.drugbank.toxicity), + "metabolism": self._replace_new_lines(target.drugbank.metabolism), + "mechanism_of_action": self._replace_new_lines(target.drugbank.mechanism_of_action), } - drug_rid = self.insert_record('drug_db', value_dict=value_dict_drug) + drug_rid = self.insert_record("drug_db", value_dict=value_dict_drug) drugbank_id_rids[drugbank_id] = drug_rid # update cache value_dict_edge = { - 'action': target.action, - 'known_action': target.known_action + "action": target.action, + "known_action": target.known_action, } - self.create_edge('has_drug_target_db', - from_rid=drug_rid, - to_rid=protein_rid, - value_dict=value_dict_edge) + self.create_edge( + "has_drug_target_db", + from_rid=drug_rid, + to_rid=protein_rid, + value_dict=value_dict_edge, + ) inserted += 1 return inserted diff --git a/ebel/manager/orientdb/biodbs/ensembl.py b/ebel/manager/orientdb/biodbs/ensembl.py index a0c1dbf..0134bab 100644 --- a/ebel/manager/orientdb/biodbs/ensembl.py +++ b/ebel/manager/orientdb/biodbs/ensembl.py @@ -1,16 +1,15 @@ """Ensembl.""" -import re import gzip -import pandas as pd - +import re from typing import Dict + +import pandas as pd from pyorientdb import OrientDB -from ebel.tools import get_file_path from ebel.manager.orientdb import odb_meta, urls from ebel.manager.orientdb.constants import ENSEMBL - from ebel.manager.rdbms.models import ensembl +from ebel.tools import get_file_path # TODO: use ftp://ftp.ensembl.org/pub/release-102/mysql/homo_sapiens_core_102_38/gene.txt.gz instead CDs @@ -23,9 +22,7 @@ def __init__(self, client: OrientDB = None): self.client = client self.biodb_name = ENSEMBL self.urls = {self.biodb_name: urls.ENSEMBL_CDS} - super().__init__(urls=self.urls, - tables_base=ensembl.Base, - biodb_name=self.biodb_name) + super().__init__(urls=self.urls, tables_base=ensembl.Base, biodb_name=self.biodb_name) def __len__(self): return self.session.query(ensembl.Ensembl).count() @@ -41,16 +38,18 @@ def insert_data(self) -> 
Dict[str, int]: with gzip.open(file_path, "r") as f: lines = [x.decode("utf-8").strip() for x in f.readlines() if x.startswith(b">")] - regex = (r"^>(?PENST\d+)\.\d+ cds chromosome:GRCh" - r"(?P\d+):" - r"(?P((1|2)?\d|X|Y|MT)):" - r"(?P\d+):" - r"(?P\d+):" - r"(?P-?1) gene:" - r"(?P(?PENSG\d+)\.\d+) " - r"gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:" - r"(?P\S+) .*? \[Source:HGNC Symbol;Acc:" - r"(?PHGNC:\d+)\]") + regex = ( + r"^>(?PENST\d+)\.\d+ cds chromosome:GRCh" + r"(?P\d+):" + r"(?P((1|2)?\d|X|Y|MT)):" + r"(?P\d+):" + r"(?P\d+):" + r"(?P-?1) gene:" + r"(?P(?PENSG\d+)\.\d+) " + r"gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:" + r"(?P\S+) .*? \[Source:HGNC Symbol;Acc:" + r"(?PHGNC:\d+)\]" + ) pattern = re.compile(regex) for line in lines: found = pattern.search(line) @@ -59,8 +58,8 @@ def insert_data(self) -> Dict[str, int]: df = pd.DataFrame(data) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(ensembl.Ensembl.__tablename__, self.engine, if_exists='append') + df.index.rename("id", inplace=True) + df.to_sql(ensembl.Ensembl.__tablename__, self.engine, if_exists="append") self.session.commit() diff --git a/ebel/manager/orientdb/biodbs/expression_atlas.py b/ebel/manager/orientdb/biodbs/expression_atlas.py index 1ebd0aa..cdafe88 100644 --- a/ebel/manager/orientdb/biodbs/expression_atlas.py +++ b/ebel/manager/orientdb/biodbs/expression_atlas.py @@ -3,11 +3,11 @@ import os import sys from collections import OrderedDict -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import pandas as pd -from pandas.core.frame import DataFrame import xmltodict +from pandas.core.frame import DataFrame from pyorientdb import OrientDB from tqdm import tqdm @@ -25,13 +25,15 @@ class ExpressionAtlas(odb_meta.Graph): def __init__(self, client: OrientDB = None): """Init ExpressionAtlas.""" self.client = client - self.biodb_name = 'expression_atlas' - self.urls = {'latest_data': 
urls.EXPRESSION_ATLAS_EXPERIMENTS} + self.biodb_name = "expression_atlas" + self.urls = {"latest_data": urls.EXPRESSION_ATLAS_EXPERIMENTS} self.data_dir = os.path.join(DATA_DIR, self.biodb_name) - super().__init__(tables_base=expression_atlas.Base, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + tables_base=expression_atlas.Base, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __len__(self): return self.number_of_generics @@ -44,7 +46,7 @@ def update(self): """Update ExpressionAtlas.""" logger.info("Update ExpressionAtlas") downloaded = self.download() - if downloaded['latest_data']: + if downloaded["latest_data"]: self.extract_files() self.insert() @@ -52,15 +54,16 @@ def extract_files(self): """Extract relevant files.""" os.chdir(self.data_dir) cmd_temp = "tar -xzf atlas-latest-data.tar.gz --wildcards --no-anchored '{}'" - patterns = ['*.sdrf.txt', - '*.condensed-sdrf.tsv', - '*analytics.tsv', - '*-configuration.xml', - '*.idf.txt', - '*.go.gsea.tsv', - '*.interpro.gsea.tsv', - '*.reactome.gsea.tsv' - ] + patterns = [ + "*.sdrf.txt", + "*.condensed-sdrf.tsv", + "*analytics.tsv", + "*-configuration.xml", + "*.idf.txt", + "*.go.gsea.tsv", + "*.interpro.gsea.tsv", + "*.reactome.gsea.tsv", + ] with tqdm(patterns) as t_patterns: for pattern in t_patterns: t_patterns.set_description(f"Extract files with pattern {pattern}") @@ -104,7 +107,7 @@ def insert(self): df_configuration = self.get_configuration(experiment_name) if isinstance(df_configuration, pd.DataFrame): df_idf = self.get_idf(experiment_name) - title = df_idf[df_idf.key_name == 'investigation_title'].value.values[0] + title = df_idf[df_idf.key_name == "investigation_title"].value.values[0] experiment_id = self.insert_experiment(experiment_name, title) @@ -121,10 +124,14 @@ def insert(self): sys.exit() def __insert_foldchange(self, experiment_id: int, experiment_name: str, groups_strs: Tuple[str, ...]): - df_log2foldchange = self.get_log2foldchange(experiment_name, 
groups_strs).set_index('group_comparison') - df_group_comparison = self.get_df_group_comparison(experiment_id, groups_strs).set_index('group_comparison') - df_log2foldchange.join(df_group_comparison).to_sql(expression_atlas.FoldChange.__tablename__, - self.engine, if_exists='append', index=False) + df_log2foldchange = self.get_log2foldchange(experiment_name, groups_strs).set_index("group_comparison") + df_group_comparison = self.get_df_group_comparison(experiment_id, groups_strs).set_index("group_comparison") + df_log2foldchange.join(df_group_comparison).to_sql( + expression_atlas.FoldChange.__tablename__, + self.engine, + if_exists="append", + index=False, + ) def get_df_group_comparison(self, experiment_id: int, groups_strs: Tuple[str, ...]) -> pd.DataFrame: """Get group comparison IDs and group comparison columns for pairs of group strings. @@ -142,31 +149,48 @@ def get_df_group_comparison(self, experiment_id: int, groups_strs: Tuple[str, .. """ data = [] for groups_str in groups_strs: - group_comparison_id = self.session.query(expression_atlas.GroupComparison.id).filter_by( - experiment_id=experiment_id, - group_comparison=groups_str).first().id + group_comparison_id = ( + self.session.query(expression_atlas.GroupComparison.id) + .filter_by(experiment_id=experiment_id, group_comparison=groups_str) + .first() + .id + ) data.append((group_comparison_id, groups_str)) - return pd.DataFrame(data, columns=['group_comparison_id', 'group_comparison']) + return pd.DataFrame(data, columns=["group_comparison_id", "group_comparison"]) def __insert_configuration(self, df_configuration, experiment_id: int) -> Tuple[str, ...]: - df_configuration['experiment_id'] = experiment_id - df_configuration.to_sql(expression_atlas.GroupComparison.__tablename__, self.engine, if_exists='append', - index=False) + df_configuration["experiment_id"] = experiment_id + df_configuration.to_sql( + expression_atlas.GroupComparison.__tablename__, + self.engine, + if_exists="append", + index=False, 
+ ) groups_strs = tuple(df_configuration.group_comparison.values) self.session.flush() self.session.commit() return groups_strs def __insert_idf(self, df_idf: DataFrame, experiment_id: int): - df_idf['experiment_id'] = experiment_id - df_idf.to_sql(expression_atlas.Idf.__tablename__, self.engine, if_exists='append', index=False) + df_idf["experiment_id"] = experiment_id + df_idf.to_sql( + expression_atlas.Idf.__tablename__, + self.engine, + if_exists="append", + index=False, + ) def __insert_sdrf_condensed(self, experiment_id: int, experiment_name: str): df = self.get_sdrf_condensed(experiment_name) # organisms = tuple(df[df.parameter == 'organism'].value.unique()) - df['experiment_id'] = experiment_id - df.drop(columns=['experiment'], inplace=True) - df.to_sql(expression_atlas.SdrfCondensed.__tablename__, self.engine, if_exists='append', index=False) + df["experiment_id"] = experiment_id + df.drop(columns=["experiment"], inplace=True) + df.to_sql( + expression_atlas.SdrfCondensed.__tablename__, + self.engine, + if_exists="append", + index=False, + ) def insert_gseas(self, experiment_id: int, experiment_name: str, groups_strs: Tuple[str, ...]): """Insert Gene set enrichment analysis. @@ -180,7 +204,12 @@ def insert_gseas(self, experiment_id: int, experiment_name: str, groups_strs: Tu """ df = self.get_gseas(experiment_name, experiment_id, groups_strs) if isinstance(df, pd.DataFrame): - df.to_sql(expression_atlas.Gsea.__tablename__, self.engine, if_exists='append', index=False) + df.to_sql( + expression_atlas.Gsea.__tablename__, + self.engine, + if_exists="append", + index=False, + ) def get_gseas(self, experiment_name: str, experiment_id: int, groups_strs: Tuple[str]) -> Optional[pd.DataFrame]: """Get GSEA data. 
@@ -200,11 +229,11 @@ def get_gseas(self, experiment_name: str, experiment_id: int, groups_strs: Tuple """ dfs = [] for groups_str in groups_strs: - for gsea_type in ['go', 'reactome', 'interpro']: + for gsea_type in ["go", "reactome", "interpro"]: df = self.get_gsea(experiment_name, groups_str, gsea_type) if isinstance(df, pd.DataFrame): - df['gsea_type'] = gsea_type - df['group_comparison_id'] = self.__get_group_comparison_id(groups_str, experiment_id) + df["gsea_type"] = gsea_type + df["group_comparison_id"] = self.__get_group_comparison_id(groups_str, experiment_id) dfs.append(df[df.p_adj_non_dir <= 0.05]) if dfs: return pd.concat(dfs) @@ -229,9 +258,11 @@ def get_gsea(self, experiment_name: str, groups_str: str, gsea_type: str) -> Opt ------- Returns a pandas DataFrame of GSEA information if file exists matching the passed parameters. """ - file_path = os.path.join(self.data_dir, - experiment_name, - f"{experiment_name}.{groups_str}.{gsea_type}.gsea.tsv") + file_path = os.path.join( + self.data_dir, + experiment_name, + f"{experiment_name}.{groups_str}.{gsea_type}.gsea.tsv", + ) if not os.path.exists(file_path): return @@ -239,7 +270,7 @@ def get_gsea(self, experiment_name: str, groups_str: str, gsea_type: str) -> Opt df = pd.read_csv(file_path, sep="\t") df.columns = [get_standard_name(x) for x in df.columns] - if 'term' in df.columns: + if "term" in df.columns: return df def get_log2foldchange(self, experiment_name: str, groups_strs: Tuple[str]) -> pd.DataFrame: @@ -260,23 +291,28 @@ def get_log2foldchange(self, experiment_name: str, groups_strs: Tuple[str]) -> p df_analyticss = self.get_analyticss(experiment_name) for df_analytics in df_analyticss: for g in groups_strs: - col_p_value = f'{g}_p_value' - col_log2foldchange = f'{g}_log2foldchange' + col_p_value = f"{g}_p_value" + col_log2foldchange = f"{g}_log2foldchange" if {col_p_value, col_log2foldchange}.issubset(df_analytics.columns): - cols = ['gene_id', 'gene_name', col_p_value, col_log2foldchange] - 
rename_map = {col_p_value: 'p_value', col_log2foldchange: 'log2foldchange'} + cols = ["gene_id", "gene_name", col_p_value, col_log2foldchange] + rename_map = { + col_p_value: "p_value", + col_log2foldchange: "log2foldchange", + } - if f'{g}_t_statistic' in df_analytics.columns: - cols.append(f'{g}_t_statistic') - rename_map[f'{g}_t_statistic'] = 't_statistic' + if f"{g}_t_statistic" in df_analytics.columns: + cols.append(f"{g}_t_statistic") + rename_map[f"{g}_t_statistic"] = "t_statistic" df = df_analytics[cols].rename(columns=rename_map) - df['group_comparison'] = g + df["group_comparison"] = g dfs.append(df) df_concat = pd.concat(dfs) - return df_concat[(df_concat.p_value <= 0.05) - & df_concat.gene_name.notnull() - & ((df_concat.log2foldchange <= -1) | (df_concat.log2foldchange >= 1))] + return df_concat[ + (df_concat.p_value <= 0.05) + & df_concat.gene_name.notnull() + & ((df_concat.log2foldchange <= -1) | (df_concat.log2foldchange >= 1)) + ] def get_idf(self, experiment_name: str) -> Optional[pd.DataFrame]: """Get Data from IDF by experiment name. 
@@ -298,14 +334,14 @@ def get_idf(self, experiment_name: str) -> Optional[pd.DataFrame]: rows = [] for line in open(file_path): - line_splitted = line.strip().split('\t') + line_splitted = line.strip().split("\t") if len(line_splitted) > 1: key_name = get_standard_name(line_splitted[0]) values = [x.strip() for x in line_splitted[1:] if x.strip()] rows.append((key_name, values)) - df = pd.DataFrame(rows, columns=('key_name', 'value')).explode('value') + df = pd.DataFrame(rows, columns=("key_name", "value")).explode("value") return df def get_sdrf_condensed(self, experiment_name: str) -> Optional[pd.DataFrame]: @@ -325,7 +361,15 @@ def get_sdrf_condensed(self, experiment_name: str) -> Optional[pd.DataFrame]: if not os.path.exists(file_path): return - names = ['experiment', 'method', 'sample', 'parameter_type', 'parameter', 'value', 'url'] + names = [ + "experiment", + "method", + "sample", + "parameter_type", + "parameter", + "value", + "url", + ] df = pd.read_csv(file_path, sep="\t", header=None, names=names) return df @@ -346,7 +390,7 @@ def get_analyticss(self, experiment_name: str) -> List[pd.DataFrame]: dfs = [] if analytics_tsv_paths: for analytics_tsv_path in analytics_tsv_paths: - df = pd.read_csv(analytics_tsv_path, sep='\t') + df = pd.read_csv(analytics_tsv_path, sep="\t") df.columns = [get_standard_name(x) for x in df.columns] dfs.append(df) return dfs @@ -368,19 +412,18 @@ def get_configuration(self, experiment_name: str) -> Optional[DataFrame]: if not os.path.exists(file_path): return - config = xmltodict.parse(open(file_path).read())['configuration'] - - if config['@experimentType'] != 'rnaseq_mrna_baseline': + config = xmltodict.parse(open(file_path).read())["configuration"] + if config["@experimentType"] != "rnaseq_mrna_baseline": compare_dict = {} - ca = config['analytics'] + ca = config["analytics"] ca_items = ca if isinstance(ca, list) else [ca] for item in ca_items: - for _, contrast in item['contrasts'].items(): + for _, contrast in 
item["contrasts"].items(): if isinstance(contrast, (OrderedDict, list)): contrasts = contrast if isinstance(contrast, list) else [contrast] for ind_contrast in contrasts: - compare_dict[ind_contrast['@id']] = ind_contrast['name'] + compare_dict[ind_contrast["@id"]] = ind_contrast["name"] - return pd.DataFrame(compare_dict.items(), columns=['group_comparison', 'name']) + return pd.DataFrame(compare_dict.items(), columns=["group_comparison", "name"]) diff --git a/ebel/manager/orientdb/biodbs/gwas_catalog.py b/ebel/manager/orientdb/biodbs/gwas_catalog.py index 5975f70..414ee8d 100644 --- a/ebel/manager/orientdb/biodbs/gwas_catalog.py +++ b/ebel/manager/orientdb/biodbs/gwas_catalog.py @@ -1,20 +1,19 @@ """GwasCatalog implementation. Depends on HGNC.""" import logging +from typing import Dict + import numpy as np import pandas as pd - -from tqdm import tqdm -from typing import Dict from pyorientdb import OrientDB +from tqdm import tqdm from ebel.constants import RID +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.biodbs.hgnc import Hgnc from ebel.manager.orientdb.constants import GWAS_CATALOG -from ebel.manager.orientdb import odb_meta, urls, odb_structure -from ebel.tools import get_disease_trait_keywords_from_config, get_file_path - from ebel.manager.rdbms.models import gwas_catalog +from ebel.tools import get_disease_trait_keywords_from_config, get_file_path logger = logging.getLogger(__name__) @@ -22,22 +21,29 @@ class GwasCatalog(odb_meta.Graph): """GWAS Catalog (EBI).""" - def __init__(self, client: OrientDB = None, - disease_trait_keyword: str = "Alzheimer", overwrite_config: bool = False): + def __init__( + self, + client: OrientDB = None, + disease_trait_keyword: str = "Alzheimer", + overwrite_config: bool = False, + ): """Init GwasCatalog.""" self.snp_cache = {} self._ensembl_gene_rid_dict: Dict[str:str] = {} - self.disease_keywords = get_disease_trait_keywords_from_config(disease_trait_keyword, - 
overwrite=overwrite_config) + self.disease_keywords = get_disease_trait_keywords_from_config( + disease_trait_keyword, overwrite=overwrite_config + ) self.client = client self.biodb_name = GWAS_CATALOG self.url = urls.GWAS_CATALOG self.urls = {self.biodb_name: self.url} - super().__init__(nodes=odb_structure.gwascatalog_nodes, - edges=odb_structure.gwascatalog_edges, - tables_base=gwas_catalog.Base, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + nodes=odb_structure.gwascatalog_nodes, + edges=odb_structure.gwascatalog_edges, + tables_base=gwas_catalog.Base, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __len__(self) -> int: """Get number of edges in OrientDB.""" @@ -50,10 +56,12 @@ def __contains__(self, rs_number: int) -> bool: def insert_data(self) -> Dict[str, int]: """Insert GwasCatalog data in generic table `gwascatalog`.""" # TODO: inform data provider about problems in dataset - df = pd.read_csv(get_file_path(self.urls[self.biodb_name], self.biodb_name), - sep="\t", - low_memory=False, - on_bad_lines='warn') + df = pd.read_csv( + get_file_path(self.urls[self.biodb_name], self.biodb_name), + sep="\t", + low_memory=False, + on_bad_lines="warn", + ) df.columns = self._standardize_column_names(df.columns) df.replace(np.inf, np.nan, inplace=True) # Get rid of infinity # replace non-dates with np.nan @@ -61,27 +69,27 @@ def insert_data(self) -> Dict[str, int]: # df.loc[non_date, 'date_added_to_catalog'] = np.nan df.index += 1 - df.rename(columns={'snps': 'snp'}, inplace=True) - df.index.rename('id', inplace=True) + df.rename(columns={"snps": "snp"}, inplace=True) + df.index.rename("id", inplace=True) df.upstream_gene_id = df.upstream_gene_id.str.strip() df.downstream_gene_id = df.downstream_gene_id.str.strip() - columns_main_table = [x for x in df.columns if x != 'snp_gene_ids'] + columns_main_table = [x for x in df.columns if x != "snp_gene_ids"] table_name = gwas_catalog.GwasCatalog.__tablename__ - 
df[columns_main_table].to_sql(table_name, self.engine, if_exists='append') + df[columns_main_table].to_sql(table_name, self.engine, if_exists="append") df.snp_gene_ids = df.snp_gene_ids.str.strip().str.split(", ") df[table_name + "_id"] = df.index - df_snp_gene_ids = df[[table_name + "_id", 'snp_gene_ids']].explode('snp_gene_ids') + df_snp_gene_ids = df[[table_name + "_id", "snp_gene_ids"]].explode("snp_gene_ids") df_snp_gene_ids.dropna(inplace=True) df_snp_gene_ids.index = range(1, df_snp_gene_ids.shape[0] + 1) - df_snp_gene_ids.rename(columns={'snp_gene_ids': 'ensembl_identifier'}, inplace=True) - df_snp_gene_ids.index.rename('id', inplace=True) - df_snp_gene_ids.to_sql(gwas_catalog.SnpGene.__tablename__, self.engine, if_exists='append') + df_snp_gene_ids.rename(columns={"snp_gene_ids": "ensembl_identifier"}, inplace=True) + df_snp_gene_ids.index.rename("id", inplace=True) + df_snp_gene_ids.to_sql(gwas_catalog.SnpGene.__tablename__, self.engine, if_exists="append") self.session.commit() @@ -105,9 +113,9 @@ def update_interactions(self) -> Dict[str, int]: inserted = {} logger.info(f"Update interactions for {self.biodb_name}") self.clear_edges() - inserted['has_mapped_snp_gc'] = self.update_mapped_genes() - inserted['has_downstream_snp_gc'] = self.update_upstream_genes() - inserted['has_upstream_snp_gc'] = self.update_downstream_genes() + inserted["has_mapped_snp_gc"] = self.update_mapped_genes() + inserted["has_downstream_snp_gc"] = self.update_upstream_genes() + inserted["has_upstream_snp_gc"] = self.update_downstream_genes() logger.info(f"Successfully updated interactions for {self.biodb_name}") return inserted @@ -115,16 +123,20 @@ def update_mapped_genes(self) -> int: """Update mapped gene information.""" inserted = 0 for disease_keyword in self.disease_keywords: - query_results = self.session.query(gwas_catalog.SnpGene) \ - .join(gwas_catalog.GwasCatalog) \ - .filter(gwas_catalog.GwasCatalog.disease_trait.like(f'%{disease_keyword}%')).with_entities( - 
gwas_catalog.SnpGene.ensembl_identifier, - gwas_catalog.GwasCatalog.snp, - gwas_catalog.GwasCatalog.disease_trait, - gwas_catalog.GwasCatalog.pubmedid - ).all() - - inserted += self.insert_snps(query_results, 'has_mapped_snp_gc', disease_keyword) + query_results = ( + self.session.query(gwas_catalog.SnpGene) + .join(gwas_catalog.GwasCatalog) + .filter(gwas_catalog.GwasCatalog.disease_trait.like(f"%{disease_keyword}%")) + .with_entities( + gwas_catalog.SnpGene.ensembl_identifier, + gwas_catalog.GwasCatalog.snp, + gwas_catalog.GwasCatalog.disease_trait, + gwas_catalog.GwasCatalog.pubmedid, + ) + .all() + ) + + inserted += self.insert_snps(query_results, "has_mapped_snp_gc", disease_keyword) return inserted def update_upstream_genes(self): @@ -133,16 +145,17 @@ def update_upstream_genes(self): for disease_keyword in self.disease_keywords: query = self.session.query(gwas_catalog.GwasCatalog).filter( - gwas_catalog.GwasCatalog.disease_trait.like(f'%{disease_keyword}%'), - gwas_catalog.GwasCatalog.upstream_gene_id.isnot(None)) + gwas_catalog.GwasCatalog.disease_trait.like(f"%{disease_keyword}%"), + gwas_catalog.GwasCatalog.upstream_gene_id.isnot(None), + ) query_results = query.with_entities( gwas_catalog.GwasCatalog.upstream_gene_id, gwas_catalog.GwasCatalog.snp, gwas_catalog.GwasCatalog.disease_trait, - gwas_catalog.GwasCatalog.pubmedid + gwas_catalog.GwasCatalog.pubmedid, ).all() - inserted += self.insert_snps(query_results, 'has_downstream_snp_gc', disease_keyword) + inserted += self.insert_snps(query_results, "has_downstream_snp_gc", disease_keyword) return inserted def update_downstream_genes(self): @@ -151,50 +164,53 @@ def update_downstream_genes(self): for disease_keyword in self.disease_keywords: query = self.session.query(gwas_catalog.GwasCatalog).filter( - gwas_catalog.GwasCatalog.disease_trait.like(f'%{disease_keyword}%'), - gwas_catalog.GwasCatalog.downstream_gene_id.isnot(None)) + gwas_catalog.GwasCatalog.disease_trait.like(f"%{disease_keyword}%"), + 
gwas_catalog.GwasCatalog.downstream_gene_id.isnot(None), + ) query_results = query.with_entities( gwas_catalog.GwasCatalog.downstream_gene_id, gwas_catalog.GwasCatalog.snp, gwas_catalog.GwasCatalog.disease_trait, - gwas_catalog.GwasCatalog.pubmedid + gwas_catalog.GwasCatalog.pubmedid, ).all() - inserted += self.insert_snps(query_results, 'has_upstream_snp_gc', disease_keyword) + inserted += self.insert_snps(query_results, "has_upstream_snp_gc", disease_keyword) return inserted def insert_snps(self, query_results, edge_class: str, keyword: str): """Insert SNP metadata into database.""" - columns = ['ensembl_gene_id', 'snp', 'disease_trait', 'pubmedid'] + columns = ["ensembl_gene_id", "snp", "disease_trait", "pubmedid"] snps = pd.DataFrame(query_results, columns=columns) - snps.set_index('ensembl_gene_id', inplace=True) + snps.set_index("ensembl_gene_id", inplace=True) inserted = 0 - desc = f'Update {edge_class} for {keyword}' + desc = f"Update {edge_class} for {keyword}" for ensembl_gene_id, row in tqdm(snps.iterrows(), total=snps.shape[0], desc=desc): snp_rid = self._get_set_snp_rid(row.snp) - value_dict = {'disease_trait': row.disease_trait, 'pubmed_id': row.pubmedid} + value_dict = {"disease_trait": row.disease_trait, "pubmed_id": row.pubmedid} gene_rid = self._get_gene_rid(ensembl_id=ensembl_gene_id) if gene_rid: - self.create_edge(edge_class, - gene_rid, - snp_rid, - value_dict=value_dict, - if_not_exists=True) + self.create_edge( + edge_class, + gene_rid, + snp_rid, + value_dict=value_dict, + if_not_exists=True, + ) inserted += 1 return inserted def _get_set_snp_rid(self, rs_number: str) -> str: """Insert snp (if not exists) and returns OrientDB @rid.""" - results = self.query_class(class_name='snp', limit=1, columns=[], rs_number=rs_number) + results = self.query_class(class_name="snp", limit=1, columns=[], rs_number=rs_number) if results: rid = results[0][RID] else: - content = {'rs_number': rs_number} - rid = self.insert_record('snp', content) + content = 
{"rs_number": rs_number} + rid = self.insert_record("snp", content) return rid @@ -202,10 +218,11 @@ def _get_set_snp_rid(self, rs_number: str) -> str: def ensembl_gene_rid_dict(self): """Return dict of ensembl IDs as keys and their rIDs as values.""" if not self._ensembl_gene_rid_dict: - sql = "Select hgnc.ensembl_gene_id as ensembl, @rid.asString() as rid from gene " \ - "where hgnc.ensembl_gene_id IS NOT NULL and namespace='HGNC' and pure=true" - self._ensembl_gene_rid_dict = {r['ensembl']: r[RID] for r in - [x.oRecordData for x in self.execute(sql)]} + sql = ( + "Select hgnc.ensembl_gene_id as ensembl, @rid.asString() as rid from gene " + "where hgnc.ensembl_gene_id IS NOT NULL and namespace='HGNC' and pure=true" + ) + self._ensembl_gene_rid_dict = {r["ensembl"]: r[RID] for r in [x.oRecordData for x in self.execute(sql)]} return self._ensembl_gene_rid_dict def _get_gene_rid(self, ensembl_id: str): @@ -214,19 +231,17 @@ def _get_gene_rid(self, ensembl_id: str): if ensembl_id in self._ensembl_gene_rid_dict: rid = self._ensembl_gene_rid_dict[ensembl_id] else: - hgnc_found = self.query_class('hgnc', - columns=['symbol'], - limit=1, - ensembl_gene_id=ensembl_id) + hgnc_found = self.query_class("hgnc", columns=["symbol"], limit=1, ensembl_gene_id=ensembl_id) if hgnc_found: r = hgnc_found[0] bel = f'g(HGNC:"{r["symbol"]}")' - data = {'pure': True, - 'bel': bel, - 'name': r['symbol'], - 'namespace': "HGNC", - 'hgnc': r[RID] - } - rid = self.get_create_rid('gene', data, check_for='bel') + data = { + "pure": True, + "bel": bel, + "name": r["symbol"], + "namespace": "HGNC", + "hgnc": r[RID], + } + rid = self.get_create_rid("gene", data, check_for="bel") self._ensembl_gene_rid_dict[ensembl_id] = rid return rid diff --git a/ebel/manager/orientdb/biodbs/hgnc.py b/ebel/manager/orientdb/biodbs/hgnc.py index b6470bf..8140544 100644 --- a/ebel/manager/orientdb/biodbs/hgnc.py +++ b/ebel/manager/orientdb/biodbs/hgnc.py @@ -1,76 +1,75 @@ """HGNC.""" -import re -import sys import 
json import logging +import re +import sys +from collections import namedtuple +from typing import Dict, Optional import numpy as np import pandas as pd - -from tqdm import tqdm -from typing import Dict from pyorientdb import OrientDB -from collections import namedtuple +from tqdm import tqdm -from ebel.tools import get_file_path +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import HGNC -from ebel.manager.orientdb import odb_meta, urls, odb_structure - from ebel.manager.rdbms.models import hgnc +from ebel.tools import get_file_path logger = logging.getLogger(__name__) -HgncEntry4Update = namedtuple("HgncEntrySimple", ['hgnc_rid', 'label', 'location', 'symbol', 'suggested_corrections']) +HgncEntry4Update = namedtuple( + "HgncEntry4Update", + ["hgnc_rid", "label", "location", "symbol", "suggested_corrections"], +) class Hgnc(odb_meta.Graph): """HGNC class definition.""" - def __init__(self, client: OrientDB = None): + def __init__(self, client: Optional[OrientDB] = None): """Init HGNC.""" self.client = client self.biodb_name = HGNC - self.urls = {self.biodb_name: urls.HGNC_JSON, 'human_ortholog': urls.HCOP_GZIP} - super().__init__(generics=odb_structure.hgnc_generics, - tables_base=hgnc.Base, - indices=odb_structure.hgnc_indices, - nodes=odb_structure.hgnc_nodes, - urls=self.urls, - biodb_name=self.biodb_name) + self.urls = {self.biodb_name: urls.HGNC_JSON, "human_ortholog": urls.HCOP_GZIP} + super().__init__( + generics=odb_structure.hgnc_generics, + tables_base=hgnc.Base, + indices=odb_structure.hgnc_indices, + nodes=odb_structure.hgnc_nodes, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __contains__(self, hgnc_id: object) -> bool: """Test existence of hgnc_id.""" if isinstance(hgnc_id, int): hgnc_id = "HGNC:{}".format(hgnc_id) r = self.execute("Select count(*) from bel where hgnc.id = '{}' limit 1".format(hgnc_id)) - return bool(len(r[0].oRecordData['count'])) + return 
bool(len(r[0].oRecordData["count"])) def __len__(self): """Count number of hgnc links in BEL graph.""" r = self.execute("Select count(*) from bel where hgnc IS NOT NULL") - return r[0].oRecordData['count'] + return r[0].oRecordData["count"] def __repr__(self) -> str: """Represent HGNC.""" template = "{{BioDatabase:Hgnc}}[url:{url}, nodes:{nodes}, generics:{generics}]" - representation = template.format( - url=self.urls, - nodes=self.number_of_nodes, - generics=self.number_of_generics - ) + representation = template.format(url=self.urls, nodes=self.number_of_nodes, generics=self.number_of_generics) return representation def insert_data(self) -> Dict[str, int]: """Check if files missing for download or generic table empty. If True then insert data.""" inserted = dict() - inserted['hgnc'] = self.import_hgnc() - inserted['hgnc_rdbms'] = self.import_hgnc_into_rdbms() - inserted['human_orthologs'] = self.insert_orthologs() + inserted["hgnc"] = self.import_hgnc() + inserted["hgnc_rdbms"] = self.import_hgnc_into_rdbms() + inserted["human_orthologs"] = self.insert_orthologs() self.session.commit() return inserted def import_hgnc_into_rdbms(self) -> int: """Insert HGNC database into RDBMS.""" - logger.info('Insert HGNC database into RDBMS.') + logger.info("Insert HGNC database into RDBMS.") file_path = get_file_path(self.urls[self.biodb_name], self.biodb_name) with open(file_path, "r", encoding="utf8") as hgnc_file: @@ -78,55 +77,81 @@ def import_hgnc_into_rdbms(self) -> int: string_encode = raw_content.encode("ascii", "ignore") # Convert unicode chars to ascii hgnc_content = json.loads(string_encode.decode()) - df = pd.DataFrame(hgnc_content['response']['docs']) + df = pd.DataFrame(hgnc_content["response"]["docs"]) self._standardize_dataframe(df) - columns = ['hgnc_id', 'version', 'bioparadigms_slc', 'cd', 'cosmic', 'date_approved_reserved', 'date_modified', - 'date_name_changed', 'date_symbol_changed', 'ensembl_gene_id', 'entrez_id', 'homeodb', 'horde_id', - 'imgt', 
'intermediate_filament_db', 'iuphar', 'lncipedia', 'lncrnadb', - 'location', 'location_sortable', 'locus_group', 'locus_type', 'mamit_trnadb', 'merops', 'mirbase', - 'name', 'orphanet', 'pseudogene_org', 'snornabase', 'status', 'symbol', 'ucsc_id', 'uuid', - 'vega_id', 'agr'] - - df['id'] = pd.to_numeric(df.hgnc_id.str.split(':').str[1]) - df.set_index('id', inplace=True) - df[columns].to_sql(hgnc.Hgnc.__tablename__, self.engine, if_exists='append') - - df.hgnc_id = pd.to_numeric(df.hgnc_id.str.split(':').str[1]) - - for df_col, model, m_col in (('prev_symbol', hgnc.PrevSymbol, None), - ('alias_symbol', hgnc.AliasSymbol, None), - ('alias_name', hgnc.AliasName, None), - ('ccds_id', hgnc.Ccds, 'identifier'), - ('ena', hgnc.Ena, 'identifier'), - ('enzyme_id', hgnc.Enzyme, 'ec_number'), - ('gene_group', hgnc.GeneGroupName, 'name'), - ('gene_group_id', hgnc.GeneGroupId, 'identifier'), - ('uniprot_ids', hgnc.UniProt, 'accession'), - ('rna_central_id', hgnc.RnaCentral, 'identifier'), - ('rgd_id', hgnc.Rgd, 'identifier'), - ('refseq_accession', hgnc.RefSeq, 'accession'), - ('pubmed_id', hgnc.PubMed, 'pmid'), - ('prev_name', hgnc.PrevName, None), - ('omim_id', hgnc.Omim, 'identifier'), - ('mgd_id', hgnc.Mgd, 'identifier'), - ('lsdb', hgnc.Lsdb, 'identifier')): - df_1n_table = df[[df_col, 'hgnc_id']].explode(df_col).dropna() + columns = [ + "hgnc_id", + "version", + "bioparadigms_slc", + "cd", + "cosmic", + "date_approved_reserved", + "date_modified", + "date_name_changed", + "date_symbol_changed", + "ensembl_gene_id", + "entrez_id", + "homeodb", + "horde_id", + "imgt", + "iuphar", + "lncipedia", + "lncrnadb", + "location", + "location_sortable", + "locus_group", + "locus_type", + "merops", + "mirbase", + "name", + "orphanet", + "snornabase", + "status", + "symbol", + "ucsc_id", + "uuid", + "vega_id", + "agr", + ] + + df["id"] = pd.to_numeric(df.hgnc_id.str.split(":").str[1]) + df.set_index("id", inplace=True) + df[columns].to_sql(hgnc.Hgnc.__tablename__, self.engine, 
if_exists="append") + + df.hgnc_id = pd.to_numeric(df.hgnc_id.str.split(":").str[1]) + + for df_col, model, m_col in ( + ("prev_symbol", hgnc.PrevSymbol, None), + ("alias_symbol", hgnc.AliasSymbol, None), + ("alias_name", hgnc.AliasName, None), + ("ccds_id", hgnc.Ccds, "identifier"), + ("ena", hgnc.Ena, "identifier"), + ("enzyme_id", hgnc.Enzyme, "ec_number"), + ("gene_group", hgnc.GeneGroupName, "name"), + ("gene_group_id", hgnc.GeneGroupId, "identifier"), + ("uniprot_ids", hgnc.UniProt, "accession"), + ("rna_central_id", hgnc.RnaCentral, "identifier"), + ("rgd_id", hgnc.Rgd, "identifier"), + ("refseq_accession", hgnc.RefSeq, "accession"), + ("pubmed_id", hgnc.PubMed, "pmid"), + ("prev_name", hgnc.PrevName, None), + ("omim_id", hgnc.Omim, "identifier"), + ("mgd_id", hgnc.Mgd, "identifier"), + ("lsdb", hgnc.Lsdb, "identifier"), + ): + df_1n_table = df[[df_col, "hgnc_id"]].explode(df_col).dropna() if m_col: df_1n_table.rename(columns={df_col: m_col}, inplace=True) - df_1n_table.to_sql( - model.__tablename__, - self.engine, - if_exists='append', - index=False) + df_1n_table.to_sql(model.__tablename__, self.engine, if_exists="append", index=False) return df.shape[0] def import_hgnc(self) -> int: """Import HGNC into OrientDB.""" # if new hgnc is imported all hgnc links should be reset and hgnc table should be empty - self.execute('Update genetic_flow set hgnc=null') - self.execute('Delete from hgnc') + self.execute("Update genetic_flow set hgnc=null") + self.execute("Delete from hgnc") file_path = get_file_path(self.urls[self.biodb_name], self.biodb_name) @@ -135,16 +160,16 @@ def import_hgnc(self) -> int: string_encode = raw_content.encode("ascii", "ignore") # Convert unicode chars to ascii hgnc_content = json.loads(string_encode.decode()) - rows = hgnc_content['response']['docs'] + rows = hgnc_content["response"]["docs"] df = pd.DataFrame(rows) df = self._standardize_dataframe(df) - df.rename(columns={'hgnc_id': 'id'}, inplace=True) + df.rename(columns={"hgnc_id": 
"id"}, inplace=True) df = df.where(pd.notnull(df), None) sql_temp = "INSERT INTO `hgnc` content {}" - new_entries = df.to_dict('records') + new_entries = df.to_dict("records") - for row in tqdm(new_entries, desc=f'Import {self.biodb_name.upper()}'): + for row in tqdm(new_entries, desc=f"Import {self.biodb_name.upper()}"): sql = sql_temp.format({k: v for k, v in row.items() if v}) try: @@ -162,26 +187,27 @@ def insert_orthologs(self) -> int: # table_name = hgnc.HumanOrtholog.__tablename__ table_name = "human_ortholog" - file_path = get_file_path(self.urls['human_ortholog'], self.biodb_name) + file_path = get_file_path(self.urls["human_ortholog"], self.biodb_name) used_columns = [ - 'hgnc_id', - 'human_entrez_gene', - 'human_ensembl_gene', - 'human_symbol', - 'ortholog_species', - 'ortholog_species_entrez_gene', - 'ortholog_species_ensembl_gene', - 'ortholog_species_db_id', - 'ortholog_species_symbol', - 'support'] - df = pd.read_csv(file_path, sep='\t', low_memory=False, usecols=used_columns) + "hgnc_id", + "human_entrez_gene", + "human_ensembl_gene", + "human_symbol", + "ortholog_species", + "ortholog_species_entrez_gene", + "ortholog_species_ensembl_gene", + "ortholog_species_db_id", + "ortholog_species_symbol", + "support", + ] + df = pd.read_csv(file_path, sep="\t", low_memory=False, usecols=used_columns) df.index += 1 - df.index.rename('id', inplace=True) - df.replace('-', np.nan, inplace=True) - df.to_sql(table_name, self.engine, chunksize=100000, if_exists='append') + df.index.rename("id", inplace=True) + df.replace("-", np.nan, inplace=True) + df.to_sql(table_name, self.engine, chunksize=100000, if_exists="append") return df.shape[0] - def get_basic_entry_by_symbol(self, symbol: str) -> HgncEntry4Update: + def get_basic_entry_by_symbol(self, symbol: str) -> Optional[HgncEntry4Update]: """Return HgncEntry4Update object.""" sql_name = """Select @rid.asString() as hgnc_rid, @@ -190,21 +216,23 @@ def get_basic_entry_by_symbol(self, symbol: str) -> 
HgncEntry4Update: location, symbol from - hgnc where symbol='{}' limit 1""".format(symbol) + hgnc where symbol='{}' limit 1""".format( + symbol + ) result = self.execute(sql_name) if result: data = result[0].oRecordData - status = data.pop('status') - if status == 'Entry Withdrawn': - data['suggested_corrections'] = f'{status}: Please use correct HGNC symbol' + status = data.pop("status") + if status == "Entry Withdrawn": + data["suggested_corrections"] = f"{status}: Please use correct HGNC symbol" else: - data['suggested_corrections'] = None + data["suggested_corrections"] = None - if 'location' in data: - data['location'] = self.get_location(data['location']) + if "location" in data: + data["location"] = self.get_location(data["location"]) else: - data['location'] = None + data["location"] = None return HgncEntry4Update(**data) @staticmethod @@ -215,31 +243,32 @@ def get_location(location: str) -> dict: # overwrite data['location'] with structured info in dict if locus_found: locus_dict = locus_found.groupdict() - lr = locus_dict['region'] - band = locus_dict['band'] - locus_dict['region'] = int(lr) if lr else lr - locus_dict['band'] = int(band) if band else band + lr = locus_dict["region"] + band = locus_dict["band"] + locus_dict["region"] = int(lr) if lr else lr + locus_dict["band"] = int(band) if band else band location_dict = locus_dict else: - location_dict = {'unknown_schema': location} + location_dict = {"unknown_schema": location} return location_dict def get_bel_symbols_without_hgnc_link(self): """Return set of all gene symbols in database without a link to HGNC.""" sql_symbols = "Select distinct(name) as symbol from bio_object where namespace='HGNC' and hgnc IS NULL" - return {x.oRecordData['symbol'] for x in self.execute(sql_symbols)} + return {x.oRecordData["symbol"] for x in self.execute(sql_symbols)} def get_bel_symbols_all(self): """Return set of all gene symbols in database.""" sql_symbols = "Select distinct(name) as symbol from bio_object where 
namespace='HGNC'" - return {x.oRecordData['symbol'] for x in self.execute(sql_symbols)} + return {x.oRecordData["symbol"] for x in self.execute(sql_symbols)} def get_correct_symbol(self, symbol: str): """Checks if symbol is valid otherwise checks previsous symbols.""" result_in_symbol = self.session.query(hgnc.Hgnc).filter(hgnc.Hgnc.symbol == symbol).first() if not result_in_symbol: - result_in_prev_symbol = self.session.query(hgnc.PrevSymbol) \ - .filter(hgnc.PrevSymbol.prev_symbol == symbol).first() + result_in_prev_symbol = ( + self.session.query(hgnc.PrevSymbol).filter(hgnc.PrevSymbol.prev_symbol == symbol).first() + ) if result_in_prev_symbol: symbol = result_in_prev_symbol.hgnc.symbol else: @@ -255,8 +284,8 @@ def correct_wrong_symbol(self, symbol, bel_symbols_all: set): sql = f"Select @rid.asString(), bel from bio_object where namespace='HGNC' and name='{symbol}'" for row in self.execute(sql): r = row.oRecordData - rid = r['rid'] - correct_bel = re.sub(rf'(?<=:")({symbol})(?=")', correct_symbol, r['bel']) + rid = r["rid"] + correct_bel = re.sub(rf'(?<=:")({symbol})(?=")', correct_symbol, r["bel"]) sql = f"UPDATE {rid} SET bel='{correct_bel}', name='{correct_symbol}'" self.execute(sql) else: @@ -266,14 +295,14 @@ def correct_wrong_symbol(self, symbol, bel_symbols_all: set): def update_bel(self) -> int: """Update links in protein, rna and gene nodes to HGNC.""" # check if hgnc is in ODB - if not self.execute("Select count(*) as num from hgnc")[0].oRecordData['num']: + if not self.execute("Select count(*) as num from hgnc")[0].oRecordData["num"]: self.import_hgnc() bel_symbols_all = self.get_bel_symbols_all() symbols_without_hgnc = self.get_bel_symbols_without_hgnc_link() hgnc_symbols = {x[0] for x in self.session.query(hgnc.Hgnc.symbol).all()} - for wrong_symbol in (symbols_without_hgnc - hgnc_symbols): + for wrong_symbol in symbols_without_hgnc - hgnc_symbols: self.correct_wrong_symbol(wrong_symbol, bel_symbols_all) new_bel_symbols = 
self.get_bel_symbols_without_hgnc_link() @@ -284,45 +313,66 @@ def update_bel(self) -> int: updated += 1 return updated - def update_gene(self, hgnc_rid: str, label: str, location: str, hgnc_symbol: str, - suggested_corrections: str) -> int: + def update_gene( + self, + hgnc_rid: str, + label: str, + location: str, + hgnc_symbol: str, + suggested_corrections: str, + ) -> int: """Update genes in OrientDB and returns number of updates.""" - suggest = ", suggested_corrections={{'wrong name': {}}}".format( - suggested_corrections) if suggested_corrections else '' + suggest = ( + ", suggested_corrections={{'wrong name': {}}}".format(suggested_corrections) + if suggested_corrections + else "" + ) sql_temp = """Update gene set hgnc = {hgnc_rid}, label= {label}, location={location} {suggest} where namespace = 'HGNC' and name = '{hgnc_symbol}'""" - sql = sql_temp.format(hgnc_rid=hgnc_rid, - label=json.dumps(label), - location=json.dumps(location), - hgnc_symbol=hgnc_symbol, - suggest=suggest) + sql = sql_temp.format( + hgnc_rid=hgnc_rid, + label=json.dumps(label), + location=json.dumps(location), + hgnc_symbol=hgnc_symbol, + suggest=suggest, + ) return self.execute(sql)[0] def update_rna(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corrections: str) -> int: """Update RNAs in OrientDB and returns number of updates.""" - suggest = ", suggested_corrections={{'wrong name': {}}}".format( - suggested_corrections) if suggested_corrections else '' + suggest = ( + ", suggested_corrections={{'wrong name': {}}}".format(suggested_corrections) + if suggested_corrections + else "" + ) sql_temp = """Update rna set hgnc = {hgnc_rid},label={label} {suggest} where namespace = 'HGNC' and name = '{hgnc_symbol}'""" - sql = sql_temp.format(hgnc_rid=hgnc_rid, - label=json.dumps(label), - hgnc_symbol=hgnc_symbol, - suggest=suggest) + sql = sql_temp.format( + hgnc_rid=hgnc_rid, + label=json.dumps(label), + hgnc_symbol=hgnc_symbol, + suggest=suggest, + ) return self.execute(sql)[0] 
def update_protein(self, hgnc_rid: str, label: str, hgnc_symbol: str, suggested_corrections: str) -> int: """Update proteins in OrientDB and returns number of updates.""" - suggest = ", suggested_corrections={{'wrong name': {}}}".format( - suggested_corrections) if suggested_corrections else '' + suggest = ( + ", suggested_corrections={{'wrong name': {}}}".format(suggested_corrections) + if suggested_corrections + else "" + ) sql_temp = """Update protein set hgnc = {hgnc_rid},label={label} {suggest} where namespace = 'HGNC' and name = '{hgnc_symbol}'""" - sql = sql_temp.format(hgnc_rid=hgnc_rid, - label=json.dumps(label), - hgnc_symbol=hgnc_symbol, - suggest=suggest) + sql = sql_temp.format( + hgnc_rid=hgnc_rid, + label=json.dumps(label), + hgnc_symbol=hgnc_symbol, + suggest=suggest, + ) return self.execute(sql)[0] def update_nodes_by_symbol(self, symbol) -> dict: @@ -337,18 +387,25 @@ def update_nodes_by_symbol(self, symbol) -> dict: hgnc_rid=hgnc.hgnc_rid, label=hgnc.label, location=hgnc.location, - suggested_corrections=suggest) + suggested_corrections=suggest, + ) num_update_rnas = self.update_rna( hgnc_symbol=hgnc.symbol, hgnc_rid=hgnc.hgnc_rid, label=hgnc.label, - suggested_corrections=suggest) + suggested_corrections=suggest, + ) num_update_proteins = self.update_protein( hgnc_symbol=hgnc.symbol, hgnc_rid=hgnc.hgnc_rid, label=hgnc.label, - suggested_corrections=suggest) - return {'genes': num_update_genes, 'rnas': num_update_rnas, 'proteins': num_update_proteins} + suggested_corrections=suggest, + ) + return { + "genes": num_update_genes, + "rnas": num_update_rnas, + "proteins": num_update_proteins, + } def get_symbol_entrez_dict(self) -> Dict[str, int]: """Return dictionary with gene symbols as keys and entrez IDs as values.""" diff --git a/ebel/manager/orientdb/biodbs/intact.py b/ebel/manager/orientdb/biodbs/intact.py index 4a75eb0..39f8bc9 100644 --- a/ebel/manager/orientdb/biodbs/intact.py +++ b/ebel/manager/orientdb/biodbs/intact.py @@ -1,19 +1,17 @@ 
"""IntAct module.""" import logging import zipfile -import pandas as pd - -from tqdm import tqdm from typing import Dict + +import pandas as pd from pyorientdb import OrientDB +from tqdm import tqdm -from ebel.tools import get_file_path -from ebel.manager.orientdb.constants import INTACT +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.biodbs.uniprot import UniProt -from ebel.manager.orientdb import odb_meta, urls, odb_structure - +from ebel.manager.orientdb.constants import INTACT from ebel.manager.rdbms.models import intact, uniprot - +from ebel.tools import get_file_path logger = logging.getLogger(__name__) @@ -28,11 +26,13 @@ def __init__(self, client: OrientDB = None, condition_keyword="Alzheimer"): self.biodb_name = INTACT self.urls = {self.biodb_name: urls.INTACT} self.file_path = get_file_path(urls.INTACT, self.biodb_name) - super().__init__(edges=odb_structure.intact_edges, - indices=odb_structure.intact_indices, - tables_base=intact.Base, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + edges=odb_structure.intact_edges, + indices=odb_structure.intact_indices, + tables_base=intact.Base, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __len__(self): return self.number_of_generics @@ -46,7 +46,6 @@ def _intact_list_to_dict(string_list: str) -> dict: dict_values = dict() split_values = [x.split(":", 1) for x in string_list.split("|")] for values in split_values: - if len(values) == 2: key, val = values[0], values[1] @@ -65,33 +64,34 @@ def insert_data(self) -> Dict[str, int]: zf = zipfile.ZipFile(self.file_path) - usecols = {'#ID(s) interactor A': "int_a_uniprot_id", - 'ID(s) interactor B': "int_b_uniprot_id", - 'Publication Identifier(s)': "pmid", - 'Interaction type(s)': "it", - 'Interaction identifier(s)': "interaction_ids", - 'Confidence value(s)': "confidence_value", - 'Interaction detection method(s)': "dm" - } - - df = pd.read_csv(zf.open('intact.txt'), sep="\t", 
usecols=usecols.keys()) + usecols = { + "#ID(s) interactor A": "int_a_uniprot_id", + "ID(s) interactor B": "int_b_uniprot_id", + "Publication Identifier(s)": "pmid", + "Interaction type(s)": "it", + "Interaction identifier(s)": "interaction_ids", + "Confidence value(s)": "confidence_value", + "Interaction detection method(s)": "dm", + } + + df = pd.read_csv(zf.open("intact.txt"), sep="\t", usecols=usecols.keys()) df.rename(columns=usecols, inplace=True) - regex_accession = r'uniprotkb:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})' + regex_accession = r"uniprotkb:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})" df.int_a_uniprot_id = df.int_a_uniprot_id.str.extract(regex_accession)[0] df.int_b_uniprot_id = df.int_b_uniprot_id.str.extract(regex_accession)[0] df = df[(pd.notnull(df.int_a_uniprot_id) & pd.notnull(df.int_b_uniprot_id))] regex_detection_method = r"psi-mi:\"MI:0*(?P\d+)\"\((?P[^)]+)\)" - df = df.join(df.dm.str.extract(regex_detection_method), how='left') - df.drop(columns=['dm'], inplace=True) + df = df.join(df.dm.str.extract(regex_detection_method), how="left") + df.drop(columns=["dm"], inplace=True) df.pmid = df.pmid.str.extract(r"pubmed:(\d+)") regex_interaction_type = r"psi-mi:\"MI:0*(?P\d+)\"\((?P[^)]+)\)" - df = df.join(df.it.str.extract(regex_interaction_type), how='left') - df.drop(columns=['it'], inplace=True) - df.confidence_value = df.confidence_value.str.extract(r'intact-miscore:(\d+(\.\d+)?)')[0] + df = df.join(df.it.str.extract(regex_interaction_type), how="left") + df.drop(columns=["it"], inplace=True) + df.confidence_value = df.confidence_value.str.extract(r"intact-miscore:(\d+(\.\d+)?)")[0] df.index += 1 - df.index.rename('id', inplace=True) + df.index.rename("id", inplace=True) - df.to_sql(intact.Intact.__tablename__, self.engine, if_exists='append') + df.to_sql(intact.Intact.__tablename__, self.engine, if_exists="append") self.session.commit() @@ -116,11 +116,14 @@ def 
get_create_rid_by_uniprot(self, uniprot_accession: str, uniprot_rid_dict: di nn = self.get_namespace_name_by_uniprot(uniprot_accession) if nn: namespace, name = nn - value_dict = {'name': name, - 'namespace': namespace, - 'pure': True, 'bel': f'p({namespace}:"{name}")', - 'uniprot': uniprot_accession} - uniprot_rid_dict[uniprot_accession] = self.get_create_rid('protein', value_dict, check_for='bel') + value_dict = { + "name": name, + "namespace": namespace, + "pure": True, + "bel": f'p({namespace}:"{name}")', + "uniprot": uniprot_accession, + } + uniprot_rid_dict[uniprot_accession] = self.get_create_rid("protein", value_dict, check_for="bel") return uniprot_rid_dict.get(uniprot_accession) def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: @@ -140,14 +143,14 @@ def get_namespace_name_by_uniprot(self, uniprot_accession: str) -> tuple: sql = f"""Select s.symbol, u.taxid from uniprot u inner join uniprot_gene_symbol s on (u.id=s.uniprot_id) where u.accession='{uniprot_accession}' limit 1""" result = self.engine.execute(sql).fetchone() - taxid_to_namespace = {9606: 'HGNC', 10090: 'MGI', 10116: 'RGD'} + taxid_to_namespace = {9606: "HGNC", 10090: "MGI", 10116: "RGD"} if result: name, taxid = result - namespace = taxid_to_namespace.get(taxid, 'UNIPROT') + namespace = taxid_to_namespace.get(taxid, "UNIPROT") return_value = (namespace, name) else: if self.session.query(uniprot.Uniprot).filter(uniprot.Uniprot.accession == uniprot_accession).first(): - return_value = ('UNIPROT', uniprot_accession) + return_value = ("UNIPROT", uniprot_accession) return return_value def update_interactions(self) -> int: @@ -190,23 +193,37 @@ def update_interactions(self) -> int: sql = sql_temp.format(uniprot_accession=uniprot_accession) result = self.engine.execute(sql) - for up_a, up_b, pmid, ia_ids, ia_type, ia_type_id, d_method, d_method_id, c_value in result.fetchall(): + for ( + up_a, + up_b, + pmid, + ia_ids, + ia_type, + ia_type_id, + d_method, + d_method_id, + 
c_value, + ) in result.fetchall(): from_rid = self.get_create_rid_by_uniprot(up_a, uniprot_rid_dict) to_rid = self.get_create_rid_by_uniprot(up_b, uniprot_rid_dict) if from_rid and to_rid: - value_dict = {'interaction_type': ia_type, - 'interaction_type_psimi_id': ia_type_id, - 'detection_method': d_method, - 'detection_method_psimi_id': d_method_id, - 'interaction_ids': dict([x.split(':', 1) for x in ia_ids.split('|')]), - 'confidence_value': float(c_value), - 'pmid': pmid} - - self.create_edge(class_name='has_ppi_ia', - from_rid=from_rid, - to_rid=to_rid, - value_dict=value_dict) + value_dict = { + "interaction_type": ia_type, + "interaction_type_psimi_id": ia_type_id, + "detection_method": d_method, + "detection_method_psimi_id": d_method_id, + "interaction_ids": dict([x.split(":", 1) for x in ia_ids.split("|")]), + "confidence_value": float(c_value), + "pmid": pmid, + } + + self.create_edge( + class_name="has_ppi_ia", + from_rid=from_rid, + to_rid=to_rid, + value_dict=value_dict, + ) updated += 1 diff --git a/ebel/manager/orientdb/biodbs/iuphar.py b/ebel/manager/orientdb/biodbs/iuphar.py index 4357e56..5f03d50 100644 --- a/ebel/manager/orientdb/biodbs/iuphar.py +++ b/ebel/manager/orientdb/biodbs/iuphar.py @@ -1,19 +1,18 @@ """IUPHAR drug module.""" import logging +from typing import Dict + import numpy as np import pandas as pd - -from tqdm import tqdm -from typing import Dict from pyorientdb import OrientDB +from tqdm import tqdm -from ebel.tools import get_file_path -from ebel.manager.orientdb.constants import IUPHAR +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.biodbs.uniprot import UniProt -from ebel.manager.orientdb import odb_meta, urls, odb_structure - +from ebel.manager.orientdb.constants import IUPHAR from ebel.manager.rdbms.models import iuphar +from ebel.tools import get_file_path logger = logging.getLogger(__name__) @@ -25,11 +24,16 @@ def __init__(self, client: OrientDB = None): """Init IUPHAR.""" 
self.client = client self.biodb_name = IUPHAR - self.urls = {'iuphar_int': urls.IUPHAR_INT, 'iuphar_ligands': urls.IUPHAR_LIGANDS} - super().__init__(edges=odb_structure.iuphar_edges, - tables_base=iuphar.Base, - urls=self.urls, - biodb_name=self.biodb_name) + self.urls = { + "iuphar_int": urls.IUPHAR_INT, + "iuphar_ligands": urls.IUPHAR_LIGANDS, + } + super().__init__( + edges=odb_structure.iuphar_edges, + tables_base=iuphar.Base, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __len__(self): return self.number_of_generics @@ -44,50 +48,71 @@ def insert_data(self) -> Dict[str, int]: ints = self.insert_interaction_data() self.session.commit() - return {'ligands': ligands, 'interactions': ints} + return {"ligands": ligands, "interactions": ints} def insert_ligand_data(self) -> int: """Insert ligand/drug data in generic OrientDB class.""" logger.info("Reading and formatting IUPHAR drug data") - df = pd.read_csv(get_file_path(self.urls['iuphar_ligands'], self.biodb_name), - sep=",", - low_memory=False, - skiprows=[0], - # dtype={'Ligand id': 'Int64', 'PubChem SID': 'Int64', 'PubChem CID': 'Int64'}, - true_values=['yes']).replace({np.nan: None}) # Convert 'yes' to True + df = pd.read_csv( + get_file_path(self.urls["iuphar_ligands"], self.biodb_name), + sep=",", + low_memory=False, + skiprows=[0], + # dtype={'Ligand id': 'Int64', 'PubChem SID': 'Int64', 'PubChem CID': 'Int64'}, + true_values=["yes"], + ).replace( + {np.nan: None} + ) # Convert 'yes' to True df.columns = self._standardize_column_names(df.columns) - df.rename(columns={'in_ch_i': 'inchi', - 'in_ch_ikey': 'inchi_key', - 'pub_chem_cid': 'pubchem_cid', - 'pub_chem_sid': 'pubchem_sid', - 'uni_prot_id': 'uniprot_id', - 'ligand_id': 'id'}, inplace=True) # Make it consistent with drugbank - df.set_index('id', inplace=True) - df.to_sql(iuphar.IupharLigand.__tablename__, self.engine, if_exists='append') + df.rename( + columns={ + "in_ch_i": "inchi", + "in_ch_ikey": "inchi_key", + "pub_chem_cid": 
"pubchem_cid", + "pub_chem_sid": "pubchem_sid", + "uni_prot_id": "uniprot_id", + "ligand_id": "id", + }, + inplace=True, + ) # Make it consistent with drugbank + df.set_index("id", inplace=True) + df.to_sql(iuphar.IupharLigand.__tablename__, self.engine, if_exists="append") return df.shape[0] def insert_interaction_data(self) -> int: """Insert interaction data in generic OrientDB class.""" logger.info("Reading and formatting IUPHAR interaction data.") - df = pd.read_csv(get_file_path(self.urls['iuphar_int'], self.biodb_name), - sep=",", - low_memory=False, - skiprows=[0], - dtype={'target_id': 'Int64', 'ligand_pubchem_sid': 'Int64', 'ligand_id': 'Int64', - 'target_ligand_id': 'Int64', 'target_ligand_pub_chem_sid': 'Int64'}, - true_values=['t'], - false_values=['f']).replace({np.nan: None}) + df = pd.read_csv( + get_file_path(self.urls["iuphar_int"], self.biodb_name), + sep=",", + low_memory=False, + skiprows=[0], + dtype={ + "target_id": "Int64", + "ligand_pubchem_sid": "Int64", + "ligand_id": "Int64", + "target_ligand_id": "Int64", + "target_ligand_pub_chem_sid": "Int64", + }, + true_values=["t"], + false_values=["f"], + ).replace({np.nan: None}) df.columns = self._standardize_column_names(df.columns) - df.rename(columns={'target_uni_prot_id': "target_uniprot", - 'target_ligand_uni_prot_id': "target_ligand_uniprot_id", - 'target_ligand_pub_chem_sid': "target_ligand_pubchem_sid", - 'ligand_pub_chem_sid': "ligand_pubchem_sid", - 'pub_med_id': "pubmed_id"}, inplace=True) + df.rename( + columns={ + "target_uni_prot_id": "target_uniprot", + "target_ligand_uni_prot_id": "target_ligand_uniprot_id", + "target_ligand_pub_chem_sid": "target_ligand_pubchem_sid", + "ligand_pub_chem_sid": "ligand_pubchem_sid", + "pub_med_id": "pubmed_id", + }, + inplace=True, + ) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(iuphar.IupharInteraction.__tablename__, self.engine, if_exists='append') + df.index.rename("id", inplace=True) + 
df.to_sql(iuphar.IupharInteraction.__tablename__, self.engine, if_exists="append") return df.shape[0] @@ -97,14 +122,14 @@ def update_interactions(self) -> int: uniprot = UniProt(self.client) iuphar_edge_type_mapper = { - 'Agonist': 'agonist_of__iu', - 'Inhibitor': 'inhibits__iu', - 'Antagonist': 'antagonist_of__iu', - 'Channel blocker': 'channel_blocker_of__iu', - 'Allosteric modulator': 'allosteric_modulator_of__iu', - 'Activator': 'activates__iu', - 'Antibody': 'antibody_against__iu', - 'Gating inhibitor': 'inhibits_gating__iu' + "Agonist": "agonist_of__iu", + "Inhibitor": "inhibits__iu", + "Antagonist": "antagonist_of__iu", + "Channel blocker": "channel_blocker_of__iu", + "Allosteric modulator": "allosteric_modulator_of__iu", + "Activator": "activates__iu", + "Antibody": "antibody_against__iu", + "Gating inhibitor": "inhibits_gating__iu", } sql = """select i.pubmed_id, i.assay_description, i.affinity_units, i.affinity_low, i.affinity_median, @@ -114,42 +139,50 @@ def update_interactions(self) -> int: on (i.ligand_id=l.id) where i.target_uniprot IS NOT NULL and pubchem_sid IS NOT NULL""" df_iuphar = pd.read_sql(sql, self.engine).replace({np.nan: None}) - df_iuphar.set_index('target_uniprot', inplace=True) + df_iuphar.set_index("target_uniprot", inplace=True) df_graph = pd.DataFrame( uniprot.get_pure_uniprot_rid_dict_in_bel_context().items(), - columns=['target_uniprot', 'rid']) - df_graph.set_index('target_uniprot', inplace=True) - df_join = df_graph.join(df_iuphar, how='inner') - - for uniprot, data in tqdm(df_join.iterrows(), total=df_join.shape[0], - desc=f"Update {self.biodb_name.upper()} interactions"): - if data.ligand_gene_symbol and data.ligand_species and 'Human' in data.ligand_species: - symbol = data.ligand_gene_symbol.split('|')[0] # human seems to always the first - a_value_dict = {'pure': True, - 'bel': f'p(HGNC:"{symbol}")', - 'namespace': 'HGNC', - 'name': symbol, - } - a_class = 'protein' + columns=["target_uniprot", "rid"], + ) + 
df_graph.set_index("target_uniprot", inplace=True) + df_join = df_graph.join(df_iuphar, how="inner") + + for uniprot, data in tqdm( + df_join.iterrows(), + total=df_join.shape[0], + desc=f"Update {self.biodb_name.upper()} interactions", + ): + if data.ligand_gene_symbol and data.ligand_species and "Human" in data.ligand_species: + symbol = data.ligand_gene_symbol.split("|")[0] # human seems to always the first + a_value_dict = { + "pure": True, + "bel": f'p(HGNC:"{symbol}")', + "namespace": "HGNC", + "name": symbol, + } + a_class = "protein" else: - a_value_dict = {'pure': True, - 'bel': f'a(PUBCHEM:"{data.pubchem_sid}")', - 'namespace': 'PUBCHEM', - 'name': str(data.pubchem_sid), - 'label': data.ligand_name - } - a_class = 'abundance' - a_rid = self.get_create_rid(a_class, value_dict=a_value_dict, check_for='bel') - - i_value_dict = {'pmids': data.pubmed_id.split('|') if data.pubmed_id else None, - 'assay_description': data.assay_description, - 'affinity_units': data.affinity_units, - 'affinity_low': data.affinity_low, - 'affinity_median': data.affinity_median, - 'affinity_high': data.affinity_high, - 'type': data.type, - 'action': data.action} - edge_class = iuphar_edge_type_mapper.get(data.type, 'iuphar_interaction') + a_value_dict = { + "pure": True, + "bel": f'a(PUBCHEM:"{data.pubchem_sid}")', + "namespace": "PUBCHEM", + "name": str(data.pubchem_sid), + "label": data.ligand_name, + } + a_class = "abundance" + a_rid = self.get_create_rid(a_class, value_dict=a_value_dict, check_for="bel") + + i_value_dict = { + "pmids": data.pubmed_id.split("|") if data.pubmed_id else None, + "assay_description": data.assay_description, + "affinity_units": data.affinity_units, + "affinity_low": data.affinity_low, + "affinity_median": data.affinity_median, + "affinity_high": data.affinity_high, + "type": data.type, + "action": data.action, + } + edge_class = iuphar_edge_type_mapper.get(data.type, "iuphar_interaction") self.create_edge(edge_class, from_rid=a_rid, to_rid=data.rid, 
value_dict=i_value_dict) # not sure if this is really needed diff --git a/ebel/manager/orientdb/biodbs/kegg.py b/ebel/manager/orientdb/biodbs/kegg.py index 3f9c401..10ad6f9 100644 --- a/ebel/manager/orientdb/biodbs/kegg.py +++ b/ebel/manager/orientdb/biodbs/kegg.py @@ -2,21 +2,21 @@ import os import re -import requests -import pandas as pd import xml.etree.ElementTree as ET - -from tqdm import tqdm +from pathlib import Path from typing import Dict + +import pandas as pd +import requests from pyorientdb import OrientDB +from tqdm import tqdm -from ebel.defaults import DATA_DIR from ebel.config import get_config_value -from ebel.manager.orientdb.constants import KEGG +from ebel.defaults import DATA_DIR +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.biodbs.hgnc import Hgnc -from ebel.manager.constants import BelPmod -from ebel.manager.orientdb import odb_meta, urls, odb_structure - +from ebel.manager.orientdb.constants import KEGG +from ebel.manager.orientdb.odb_defaults import BelPmod from ebel.manager.rdbms.models import kegg @@ -30,24 +30,29 @@ def __init__(self, client: OrientDB = None): """Init KEGG.""" self.client = client self.biodb_name = KEGG - self.urls = {'kegg_path_list': urls.KEGG_PATH_LIST} + self.urls = {"kegg_path_list": urls.KEGG_PATH_LIST} - # TODO Improve KGML folder creation - self.file_paths = {'kgml': os.path.join(DATA_DIR, self.biodb_name, 'kgml')} - _ = [os.makedirs(bio_db_dir, exist_ok=True) for bio_db_dir in self.file_paths.values()] + self.file_paths = {"kgml": Path(DATA_DIR, self.biodb_name, "kgml")} + self.file_paths["kgml"].mkdir(exist_ok=True, parents=True) - super().__init__(tables_base=kegg.Base, - edges=odb_structure.kegg_edges, - urls=self.urls, - biodb_name=self.biodb_name - ) + super().__init__( + tables_base=kegg.Base, + edges=odb_structure.kegg_edges, + urls=self.urls, + biodb_name=self.biodb_name, + ) species_list = get_config_value(section="KEGG", option="species", value="hsa, dme, 
mmu, rno") self.species = [species_id.strip() for species_id in species_list.split(",")] # TODO Make so it builds dict from info from KEGG directly. JSON url in urls.py, needs method to parse - self.species_dict = {'hsa': 9606, 'dme': 7227, 'mmu': 10090, 'rno': 10116} - self.tax_namespace_dict = {9606: 'HGNC', 10090: 'MGI', 10116: 'RGD', 7227: 'DMEL'} + self.species_dict = {"hsa": 9606, "dme": 7227, "mmu": 10090, "rno": 10116} + self.tax_namespace_dict = { + 9606: "HGNC", + 10090: "MGI", + 10116: "RGD", + 7227: "DMEL", + } self.species_namespace_dict = {k: self.tax_namespace_dict[v] for k, v in self.species_dict.items()} self.hgnc = Hgnc(self.client) self.bel_rid_cache = {} @@ -65,12 +70,20 @@ def get_entries(kegg_xml) -> dict: """Return all KEGG XML entries.""" root = ET.fromstring(kegg_xml) dict_entry = {} - for entry in root.iter('entry'): + for entry in root.iter("entry"): attribute = entry.attrib - if 'link' in attribute: - dict_entry[attribute['id']] = [attribute['name'].split(' '), attribute['type'], attribute['link']] + if "link" in attribute: + dict_entry[attribute["id"]] = [ + attribute["name"].split(" "), + attribute["type"], + attribute["link"], + ] else: - dict_entry[attribute['id']] = [attribute['name'].split(' '), attribute['type'], 'NA'] + dict_entry[attribute["id"]] = [ + attribute["name"].split(" "), + attribute["type"], + "NA", + ] return dict_entry @@ -90,12 +103,12 @@ def get_relations(kegg_xml: str) -> list: """ root = ET.fromstring(kegg_xml) relations = [] - for rel in root.iter('relation'): + for rel in root.iter("relation"): relation_dict = dict() - relation_dict['entry1'] = rel.attrib['entry1'] - relation_dict['entry2'] = rel.attrib['entry2'] - relation_dict['kegg_int_type'] = rel.attrib['type'] - relation_dict['interaction_type'] = [name.attrib['name'] for name in rel.iter('subtype')] + relation_dict["entry1"] = rel.attrib["entry1"] + relation_dict["entry2"] = rel.attrib["entry2"] + relation_dict["kegg_int_type"] = rel.attrib["type"] + 
relation_dict["interaction_type"] = [name.attrib["name"] for name in rel.iter("subtype")] relations.append(relation_dict) return relations @@ -104,30 +117,31 @@ def get_relations(kegg_xml: str) -> list: def get_kegg_gene_identifiers(kegg_species_id: str) -> dict: """Return a dict of KEGG genes and their IDs.""" url_kegg_genes = f"http://rest.kegg.jp/list/{kegg_species_id}" - df_kegg_genes = pd.read_csv(url_kegg_genes, - sep='\t', - names=['kegg_id', 'external_id'], - ) - dict_kegg_genes = df_kegg_genes.groupby('kegg_id')['external_id'].apply(list).to_dict() + df_kegg_genes = pd.read_csv( + url_kegg_genes, + sep="\t", + names=["kegg_id", "external_id"], + ) + dict_kegg_genes = df_kegg_genes.groupby("kegg_id")["external_id"].apply(list).to_dict() # Split by ',' or ';' and take the first gene name for kegg_id, values in dict_kegg_genes.items(): - dict_kegg_genes[kegg_id] = re.split(r'[,;]+', values[0])[0] + dict_kegg_genes[kegg_id] = re.split(r"[,;]+", values[0])[0] return dict_kegg_genes def _get_pathway_kgml(self, pathway: str): """Reads KGML pathway file if downloaded, downloads if not there. 
Returns XML content.""" - kgml_path = os.path.join(self.file_paths['kgml'], pathway) + kgml_path = self.file_paths["kgml"].joinpath(pathway) - if os.path.exists(kgml_path): - with open(kgml_path, 'r') as kgmlf: + if kgml_path.is_file(): + with open(kgml_path, "r") as kgmlf: xml_content = kgmlf.read() else: url_path = f"http://rest.kegg.jp/get/{pathway}/kgml" xml_content = requests.get(url_path).text - with open(kgml_path, 'w') as kgmlf: + with open(kgml_path, "w") as kgmlf: kgmlf.write(xml_content) return xml_content @@ -143,24 +157,29 @@ def insert_data(self) -> Dict[str, int]: gene_tag = f"{kegg_species_identifier}:" url_pathway_list = f"http://rest.kegg.jp/list/pathway/{kegg_species_identifier}" - df_pathways = pd.read_csv(url_pathway_list, sep='\t', names=['path_id', 'path_desc']) - df_pathways[['path_name', 'organism']] = df_pathways.path_desc.str.split(' - ', 1, expand=True) - df_pathways.path_id = df_pathways.path_id.str.split(':').str[1] - df_pathways.drop(columns=['path_desc'], inplace=True) + df_pathways = pd.read_csv(url_pathway_list, sep="\t", names=["path_id", "path_desc"]) + df_pathways[["path_name", "organism"]] = df_pathways.path_desc.str.split(pat=" - ", n=1, expand=True) + # df_pathways.path_id = df_pathways.path_id.str.lstrip( + # kegg_species_identifier + # ) + df_pathways.drop(columns=["path_desc"], inplace=True) desc = f"Inserting KEGG data for KEGG species: {kegg_species_identifier}" - for p in tqdm(df_pathways.to_dict('records'), desc=desc): - xml_content = self._get_pathway_kgml(p['path_id']) + for p in tqdm(df_pathways.to_dict("records"), desc=desc): + xml_content = self._get_pathway_kgml(p["path_id"]) dict_entries = self.get_entries(xml_content) relations = self.get_relations(xml_content) # Currently it only take interactions between proteins/genes. 
Ignores compounds and links to paths generic_table_rows = [] for relation in relations: - entry_a, entry_b = dict_entries[relation['entry1']], dict_entries[relation['entry2']] - kegg_int_type = relation['kegg_int_type'] - interaction_type = relation['interaction_type'] + entry_a, entry_b = ( + dict_entries[relation["entry1"]], + dict_entries[relation["entry2"]], + ) + kegg_int_type = relation["kegg_int_type"] + interaction_type = relation["interaction_type"] # TODO implement compound for kegg_gene_a in entry_a[0]: @@ -169,41 +188,61 @@ def insert_data(self) -> Dict[str, int]: gene_symbol_a = dict_kegg_genes.get(kegg_gene_a) gene_symbol_b = dict_kegg_genes.get(kegg_gene_b) if gene_symbol_a and gene_symbol_b: - generic_table_rows.append([kegg_species_identifier, kegg_gene_a, gene_symbol_a, - kegg_gene_b, gene_symbol_b, interaction_type, - kegg_int_type, p['path_id'], p['path_name'] - ]) + generic_table_rows.append( + [ + kegg_species_identifier, + kegg_gene_a, + gene_symbol_a, + kegg_gene_b, + gene_symbol_b, + interaction_type, + kegg_int_type, + p["path_id"], + p["path_name"], + ] + ) - df = pd.DataFrame(generic_table_rows, - columns=['kegg_species_id', 'kegg_gene_id_a', 'gene_symbol_a', - 'kegg_gene_id_b', 'gene_symbol_b', 'interaction_type', 'kegg_int_type', - 'pathway_identifier', 'pathway_name']) + df = pd.DataFrame( + generic_table_rows, + columns=[ + "kegg_species_id", + "kegg_gene_id_a", + "gene_symbol_a", + "kegg_gene_id_b", + "gene_symbol_b", + "interaction_type", + "kegg_int_type", + "pathway_identifier", + "pathway_name", + ], + ) dfs.append(df) df_all = pd.concat(dfs) - df_all.explode('interaction_type').to_sql(kegg.Kegg.__tablename__, - self.engine, - if_exists='append', - index=False) + df_all.explode("interaction_type").to_sql( + kegg.Kegg.__tablename__, self.engine, if_exists="append", index=False + ) return {self.biodb_name: df_all.shape[0]} def get_pure_rid(self, class_name, symbol, all_prot_symbol_rid_dict): """Get pure protein node rID with the 
metadata.""" rid = None - bel_class_mapper = {'protein': 'p', 'rna': 'r'} + bel_class_mapper = {"protein": "p", "rna": "r"} if symbol in all_prot_symbol_rid_dict: rid = all_prot_symbol_rid_dict[symbol] else: correct_symbol = self.hgnc.get_correct_symbol(symbol) if correct_symbol: short_bel_function = bel_class_mapper[class_name] - val_dict = {'name': correct_symbol, - 'namespace': 'HGNC', - 'bel': f'{short_bel_function}(HGNC:"{correct_symbol}")', - 'pure': True} - rid = self.get_create_rid(class_name, val_dict, check_for='bel') + val_dict = { + "name": correct_symbol, + "namespace": "HGNC", + "bel": f'{short_bel_function}(HGNC:"{correct_symbol}")', + "pure": True, + } + rid = self.get_create_rid(class_name, val_dict, check_for="bel") all_prot_symbol_rid_dict[correct_symbol] = rid return rid @@ -213,8 +252,13 @@ def get_create_pure_protein(self, namespace: str, name: str) -> str: if bel in self.bel_rid_cache: rid = self.bel_rid_cache[bel] else: - value_dict = {'bel': bel, 'pure': True, 'namespace': namespace, 'name': name} - rid = self.get_create_rid('protein', value_dict=value_dict, check_for='bel') + value_dict = { + "bel": bel, + "pure": True, + "namespace": namespace, + "name": name, + } + rid = self.get_create_rid("protein", value_dict=value_dict, check_for="bel") self.bel_rid_cache[bel] = rid return rid @@ -226,15 +270,16 @@ def update_interactions(self) -> int: inserted = 0 - pmods = {'dephosphorylation': ('pho', 'decreases', BelPmod.PHO), - 'glycosylation': ('gly', 'increases', BelPmod.GLY), - 'methylation': ('me0', 'increases', BelPmod.ME0), - 'phosphorylation': ('pho', 'increases', BelPmod.PHO), - 'ubiquitination': ('ubi', 'increases', BelPmod.UBI) - } - post_translational_modifications = ','.join([f"'{x}'" for x in pmods.keys()]) + pmods = { + "dephosphorylation": ("pho", "decreases", BelPmod.PHO), + "glycosylation": ("gly", "increases", BelPmod.GLY), + "methylation": ("me0", "increases", BelPmod.ME0), + "phosphorylation": ("pho", "increases", 
BelPmod.PHO), + "ubiquitination": ("ubi", "increases", BelPmod.UBI), + } + post_translational_modifications = ",".join([f"'{x}'" for x in pmods.keys()]) - species_ids = ','.join([f"'{x}'" for x in self.species]) + species_ids = ",".join([f"'{x}'" for x in self.species]) sql_temp = f"""Select interaction_type, @@ -260,10 +305,28 @@ def update_interactions(self) -> int: for symbol, rid in tqdm(symbol_rids_dict.items(), desc="Update KEGG posttranslational modifications"): sql = sql_temp.format(symbol=symbol, interaction_types=post_translational_modifications) df = pd.read_sql(sql, self.engine) - by = ['interaction_type', 'gene_symbol_a', 'gene_symbol_b', 'kegg_species_id'] + keys = ( + "interaction_type", + "gene_symbol_a", + "gene_symbol_b", + "kegg_species_id", + ) + + grouped_records = dict() + for _, row in df.iterrows(): + index = row[[keys]] + pname = row["pathway_name"] + + if index in grouped_records: + grouped_records[index].add(pname) - for key, pathway_names in df.groupby(by=by).apply(lambda x: set(x.pathway_name)).to_dict().items(): + else: + grouped_records[index] = {pname} + # TODO not working so using the more verbose method above + # grouped_records = df.groupby(by=by).apply(lambda x: set(x.pathway_name)).to_dict() + + for key, pathway_names in grouped_records.items(): interaction_type, subject_name, object_name, kegg_species_id = key namespace = self.species_namespace_dict[kegg_species_id] @@ -274,20 +337,24 @@ def update_interactions(self) -> int: ebel_pmod, effect, bel_pmod = pmods[interaction_type] pmod_bel = f'p({namespace}:"{object_name}",pmod({bel_pmod}))' - pmod_value_dict = {'name': object_name, 'namespace': namespace, 'bel': pmod_bel} + pmod_value_dict = { + "name": object_name, + "namespace": namespace, + "bel": pmod_bel, + } - pmod_protein_rid = self.node_exists('protein', pmod_value_dict, check_for='bel') + pmod_protein_rid = self.node_exists("protein", pmod_value_dict, check_for="bel") if not pmod_protein_rid: - pmod_protein_rid = 
self.get_create_rid('protein', pmod_value_dict, check_for='bel') - self.create_edge('has_modified_protein', object_rid, pmod_protein_rid) - pmod_rid = self.insert_record('pmod', {'bel': f"pmod({bel_pmod})", 'type': ebel_pmod}) - self.create_edge('has__pmod', pmod_protein_rid, pmod_rid) + pmod_protein_rid = self.get_create_rid("protein", pmod_value_dict, check_for="bel") + self.create_edge("has_modified_protein", object_rid, pmod_protein_rid) + pmod_rid = self.insert_record("pmod", {"bel": f"pmod({bel_pmod})", "type": ebel_pmod}) + self.create_edge("has__pmod", pmod_protein_rid, pmod_rid) edge_class = f"{effect}_{ebel_pmod}_kg" edge_value_dict = { "interaction_type": interaction_type, - "pathway_name": list(pathway_names) + "pathway_name": list(pathway_names), } self.create_edge(edge_class, subject_rid, pmod_protein_rid, edge_value_dict) inserted += 1 diff --git a/ebel/manager/orientdb/biodbs/mirtarbase.py b/ebel/manager/orientdb/biodbs/mirtarbase.py index e7603f7..14eec42 100644 --- a/ebel/manager/orientdb/biodbs/mirtarbase.py +++ b/ebel/manager/orientdb/biodbs/mirtarbase.py @@ -1,15 +1,14 @@ """miRTarBase.""" -import pandas as pd - -from tqdm import tqdm from typing import Dict + +import pandas as pd from pyorientdb import OrientDB +from tqdm import tqdm -from ebel.tools import get_file_path +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import MIRTARBASE -from ebel.manager.orientdb import odb_meta, urls, odb_structure - from ebel.manager.rdbms.models import mirtarbase +from ebel.tools import get_file_path class MirTarBase(odb_meta.Graph): @@ -21,10 +20,12 @@ def __init__(self, client: OrientDB = None): self.biodb_name = MIRTARBASE self.urls = {self.biodb_name: urls.MIRTARBASE} self.file_path = get_file_path(urls.MIRTARBASE, self.biodb_name) - super().__init__(urls=self.urls, - tables_base=mirtarbase.Base, - edges=odb_structure.mirtarbase_edges, - biodb_name=self.biodb_name) + super().__init__( + 
urls=self.urls, + tables_base=mirtarbase.Base, + edges=odb_structure.mirtarbase_edges, + biodb_name=self.biodb_name, + ) def __len__(self) -> Dict[str, int]: return self.number_of_generics @@ -40,16 +41,21 @@ def insert_data(self) -> Dict[str, int]: df = pd.read_excel(self.file_path) df.columns = self._standardize_column_names(df.columns) df.index += 1 - df.index.rename('id', inplace=True) + df.index.rename("id", inplace=True) - df.to_sql(mirtarbase.Mirtarbase.__tablename__, self.engine, if_exists='append', chunksize=10000) + df.to_sql( + mirtarbase.Mirtarbase.__tablename__, + self.engine, + if_exists="append", + chunksize=10000, + ) return {self.biodb_name: df.shape[0]} def update_interactions(self) -> int: """Update edges with mirtarbase metadata.""" self.clear_edges() - df_symbol_rid = self.get_pure_symbol_rid_df_in_bel_context(class_name='rna', namespace='HGNC') + df_symbol_rid = self.get_pure_symbol_rid_df_in_bel_context(class_name="rna", namespace="HGNC") sql = f"""Select mi_rna, @@ -63,24 +69,28 @@ def update_interactions(self) -> int: species_mi_rna='Homo sapiens' and species_target_gene='Homo sapiens' and support_type in ('Functional MTI', 'Non-Functional MTI')""" - cols = ['mi_rna', 'symbol', 'support_type', 'pmid', 'experiments'] + cols = ["mi_rna", "symbol", "support_type", "pmid", "experiments"] df_mirtarbase = pd.DataFrame(self.engine.execute(sql).fetchall(), columns=cols) - df_mirtarbase.experiments = df_mirtarbase.experiments.str.split('//') - df_join = df_mirtarbase.set_index('symbol').join(df_symbol_rid.set_index('symbol'), how='inner') + df_mirtarbase.experiments = df_mirtarbase.experiments.str.split("//") + df_join = df_mirtarbase.set_index("symbol").join(df_symbol_rid.set_index("symbol"), how="inner") desc = f"Update {self.biodb_name.upper()} interactions" updated = 0 - for protein_rid, row in tqdm(df_join.set_index('rid').iterrows(), total=df_join.shape[0], desc=desc): - mir_data = {'bel': f'm(MIRBASE:"{row.mi_rna}")', - 'name': row.mi_rna, - 
'namespace': "MIRBASE", - 'pure': True} - mir_rid = self.get_create_rid('micro_rna', mir_data, check_for='bel') - value_dict = {'support_type': row.support_type, - 'pmid': row.pmid, - 'experiments': row.experiments} - self.create_edge('has_mirgene_target', mir_rid, str(protein_rid), value_dict=value_dict) + for protein_rid, row in tqdm(df_join.set_index("rid").iterrows(), total=df_join.shape[0], desc=desc): + mir_data = { + "bel": f'm(MIRBASE:"{row.mi_rna}")', + "name": row.mi_rna, + "namespace": "MIRBASE", + "pure": True, + } + mir_rid = self.get_create_rid("micro_rna", mir_data, check_for="bel") + value_dict = { + "support_type": row.support_type, + "pmid": row.pmid, + "experiments": row.experiments, + } + self.create_edge("has_mirgene_target", mir_rid, str(protein_rid), value_dict=value_dict) updated += 1 return updated diff --git a/ebel/manager/orientdb/biodbs/ncbi.py b/ebel/manager/orientdb/biodbs/ncbi.py index 91d3897..c395fc9 100644 --- a/ebel/manager/orientdb/biodbs/ncbi.py +++ b/ebel/manager/orientdb/biodbs/ncbi.py @@ -1,15 +1,14 @@ """NCBI module.""" -import pandas as pd +from typing import Dict, Optional -from tqdm import tqdm +import pandas as pd from pyorientdb import OrientDB -from typing import Dict, Optional +from tqdm import tqdm -from ebel.manager.orientdb.constants import NCBI from ebel.manager.orientdb import odb_meta, urls -from ebel.tools import get_file_path, get_standard_name - +from ebel.manager.orientdb.constants import NCBI from ebel.manager.rdbms.models import ncbi +from ebel.tools import get_file_path, get_standard_name class Ncbi(odb_meta.Graph): @@ -19,19 +18,18 @@ def __init__(self, client: OrientDB = None): """Init NcbiGene.""" self.client = client self.biodb_name = NCBI - self.urls = {ncbi.NcbiGeneInfo.__tablename__: urls.NCBI_GENE_INFO, - ncbi.NcbiGeneMim.__tablename__: urls.NCBI_GENE_2MIM, - ncbi.NcbiGeneEnsembl.__tablename__: urls.NCBI_GENE_2ENSEMBL, - ncbi.NcbiGeneGo.__tablename__: urls.NCBI_GENE_2GO, - 
ncbi.NcbiGenePubmed.__tablename__: urls.NCBI_GENE_2PUBMED, - ncbi.NcbiGeneOrtholog.__tablename__: urls.NCBI_GENE_ORTHOLOG, - ncbi.NcbiMedGenName.__tablename__: urls.NCBI_GENE_MEDGEN_NAMES, - ncbi.NcbiMedGenPmid.__tablename__: urls.NCBI_GENE_MEDGEN_PUBMED, - 'neighbors': urls.NCBI_GENE_NEIGHBORS, - } - super().__init__(urls=self.urls, - tables_base=ncbi.Base, - biodb_name=self.biodb_name) + self.urls = { + ncbi.NcbiGeneInfo.__tablename__: urls.NCBI_GENE_INFO, + ncbi.NcbiGeneMim.__tablename__: urls.NCBI_GENE_2MIM, + ncbi.NcbiGeneEnsembl.__tablename__: urls.NCBI_GENE_2ENSEMBL, + ncbi.NcbiGeneGo.__tablename__: urls.NCBI_GENE_2GO, + ncbi.NcbiGenePubmed.__tablename__: urls.NCBI_GENE_2PUBMED, + ncbi.NcbiGeneOrtholog.__tablename__: urls.NCBI_GENE_ORTHOLOG, + ncbi.NcbiMedGenName.__tablename__: urls.NCBI_GENE_MEDGEN_NAMES, + ncbi.NcbiMedGenPmid.__tablename__: urls.NCBI_GENE_MEDGEN_PUBMED, + "neighbors": urls.NCBI_GENE_NEIGHBORS, + } + super().__init__(urls=self.urls, tables_base=ncbi.Base, biodb_name=self.biodb_name) def __len__(self): return self.session.query(ncbi.NcbiGeneInfo).count() @@ -88,7 +86,7 @@ def _insert_medgen(self) -> int: table_name = ncbi.NcbiMedGenName.__tablename__ file_path_name = get_file_path(self.urls[table_name], self.biodb_name) use_cols_name = ["#CUI", "name", "source", "SUPPRESS"] - df_name = pd.read_csv(file_path_name, usecols=use_cols_name, sep='|').rename_axis('id') + df_name = pd.read_csv(file_path_name, usecols=use_cols_name, sep="|").rename_axis("id") df_name.index += 1 self._standardize_dataframe(df_name) df_name.to_sql(table_name, self.engine, if_exists="append") @@ -96,10 +94,16 @@ def _insert_medgen(self) -> int: table_pmid = ncbi.NcbiMedGenPmid.__tablename__ file_path_pmid = get_file_path(self.urls[table_pmid], self.biodb_name) use_cols_pmid = ["CUI", "PMID"] - dfs_pmid = pd.read_csv(file_path_pmid, index_col='CUI', usecols=use_cols_pmid, sep='|', chunksize=1000000) + dfs_pmid = pd.read_csv( + file_path_pmid, + index_col="CUI", + 
usecols=use_cols_pmid, + sep="|", + chunksize=1000000, + ) - df_name['ncbi_medgen_name_id'] = df_name.index - df_name_fk = df_name.set_index('cui')[['ncbi_medgen_name_id']] + df_name["ncbi_medgen_name_id"] = df_name.index + df_name_fk = df_name.set_index("cui")[["ncbi_medgen_name_id"]] inserted = 0 for df in dfs_pmid: @@ -123,13 +127,12 @@ def _insert_go(self) -> int: self._standardize_dataframe(df) df.index += 1 - df.index.rename('id', inplace=True) - df.pub_med = df.pub_med.str.split('|') - rename_columns = {'id': "ncbi_gene_go_id", 'pub_med': "pmid"} - df_pub_med = df[['pub_med']].explode('pub_med').dropna() \ - .reset_index().rename(columns=rename_columns) + df.index.rename("id", inplace=True) + df.pub_med = df.pub_med.str.split("|") + rename_columns = {"id": "ncbi_gene_go_id", "pub_med": "pmid"} + df_pub_med = df[["pub_med"]].explode("pub_med").dropna().reset_index().rename(columns=rename_columns) - df.drop(columns=['pub_med'], inplace=True) + df.drop(columns=["pub_med"], inplace=True) df.to_sql(table, self.engine, if_exists="append") df_pub_med.to_sql(table_pmid, self.engine, if_exists="append", index=False) @@ -161,7 +164,7 @@ def _insert_orthologs(self) -> int: :return: Number of inserts """ - use_cols = ['#tax_id', 'GeneID', 'Other_tax_id', 'Other_GeneID'] + use_cols = ["#tax_id", "GeneID", "Other_tax_id", "Other_GeneID"] return self.__insert_table(ncbi.NcbiGeneOrtholog, use_cols, chunksize=1000000) def _insert_neighbors(self) -> Dict[str, int]: @@ -169,12 +172,19 @@ def _insert_neighbors(self) -> Dict[str, int]: :return int: Number of inserts """ - usecols = ['GeneID', 'GeneIDs_on_left', 'GeneIDs_on_right', 'overlapping_GeneIDs'] - file_path = get_file_path(self.urls['neighbors'], self.biodb_name) - - neighbor_types = {'overlapping_gene_ids': ncbi.NcbiGeneOverlapping, - 'gene_ids_on_right': ncbi.NcbiGeneOnRight, - 'gene_ids_on_left': ncbi.NcbiGeneOnLeft} + usecols = [ + "GeneID", + "GeneIDs_on_left", + "GeneIDs_on_right", + "overlapping_GeneIDs", + ] + 
file_path = get_file_path(self.urls["neighbors"], self.biodb_name) + + neighbor_types = { + "overlapping_gene_ids": ncbi.NcbiGeneOverlapping, + "gene_ids_on_right": ncbi.NcbiGeneOnRight, + "gene_ids_on_left": ncbi.NcbiGeneOnLeft, + } inserted = {neighbor_type: 0 for neighbor_type in neighbor_types} @@ -184,17 +194,13 @@ def _insert_neighbors(self) -> Dict[str, int]: df.columns = [get_standard_name(x) for x in df.columns] for neighbor_type, model in neighbor_types.items(): - df_type = df[['gene_id', neighbor_type]].set_index('gene_id') - ntype = neighbor_type.replace('ids', 'id') + df_type = df[["gene_id", neighbor_type]].set_index("gene_id") + ntype = neighbor_type.replace("ids", "id") df_type.rename(columns={neighbor_type: ntype}, inplace=True) - df_type[ntype] = df_type[ntype].str.split('|') + df_type[ntype] = df_type[ntype].str.split("|") df_type = df_type.explode(ntype) - df_type = df_type[df_type[ntype] != '-'].reset_index() - df_type.to_sql( - model.__tablename__, - self.engine, - if_exists='append', - index=False) + df_type = df_type[df_type[ntype] != "-"].reset_index() + df_type.to_sql(model.__tablename__, self.engine, if_exists="append", index=False) inserted[neighbor_type] += df_type.shape[0] return inserted @@ -205,15 +211,20 @@ def _insert_info_description(self) -> pd.DataFrame: """ table = ncbi.NcbiGeneInfo.__tablename__ file_path = get_file_path(self.urls[table], self.biodb_name) - df = pd.read_csv(file_path, - sep="\t", - usecols=['description'], - dtype={'description': 'string'}) \ - .dropna().drop_duplicates() + df = ( + pd.read_csv( + file_path, + sep="\t", + usecols=["description"], + dtype={"description": "string"}, + ) + .dropna() + .drop_duplicates() + ) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(ncbi.NcbiGeneInfoDescription.__tablename__, self.engine, if_exists='append') - return df.assign(description_id=df.index).set_index(['description']) + df.index.rename("id", inplace=True) + 
df.to_sql(ncbi.NcbiGeneInfoDescription.__tablename__, self.engine, if_exists="append") + return df.assign(description_id=df.index).set_index(["description"]) def _insert_info_xref(self, dataframe: pd.DataFrame) -> int: """Insert cross references to Gene IDs using Gene Info DataFrame. @@ -222,15 +233,15 @@ def _insert_info_xref(self, dataframe: pd.DataFrame) -> int: :return int: Number of inserts """ inserted = 0 - df = dataframe[['db_xrefs', 'gene_id']][dataframe.db_xrefs != '-'].dropna() + df = dataframe[["db_xrefs", "gene_id"]][dataframe.db_xrefs != "-"].dropna() if not df.empty: - df.set_index('gene_id', inplace=True) - df['db_xrefs'] = df['db_xrefs'].str.split('|') - df = df.explode('db_xrefs') - df[['db', 'dbid']] = df['db_xrefs'].str.split(':', 1, expand=True) - df.drop(columns=['db_xrefs'], inplace=True) - df.reset_index().to_sql('ncbi_gene_info_xref', self.engine, if_exists="append", index=False) + df.set_index("gene_id", inplace=True) + df["db_xrefs"] = df["db_xrefs"].str.split("|") + df = df.explode("db_xrefs") + df[["db", "dbid"]] = df["db_xrefs"].str.split(":", n=1, expand=True) + df.drop(columns=["db_xrefs"], inplace=True) + df.reset_index().to_sql("ncbi_gene_info_xref", self.engine, if_exists="append", index=False) inserted = df.shape[0] return inserted @@ -252,17 +263,32 @@ def _insert_info(self, chunksize: int = 1000000) -> int: # df_type_of_gene.index += 1 # df_type_of_gene.to_sql() - use_cols = {'#tax_id', 'GeneID', 'LocusTag', 'Symbol', 'chromosome', 'description', 'map_location', - 'type_of_gene', 'dbXrefs'} - for df in tqdm(pd.read_csv(file_path, - sep="\t", - low_memory=False, - usecols=use_cols, - chunksize=chunksize), desc=f"Import {self.biodb_name.upper()}"): + use_cols = { + "#tax_id", + "GeneID", + "LocusTag", + "Symbol", + "chromosome", + "description", + "map_location", + "type_of_gene", + "dbXrefs", + } + for df in tqdm( + pd.read_csv( + file_path, + sep="\t", + low_memory=False, + usecols=use_cols, + chunksize=chunksize, + ), + 
desc=f"Import {self.biodb_name.upper()}", + ): self._standardize_dataframe(df) - df.drop(columns=['db_xrefs']).set_index(['description']).join(df_info_decr).set_index('gene_id').to_sql( - table, self.engine, if_exists="append", index=True) + df.drop(columns=["db_xrefs"]).set_index(["description"]).join(df_info_decr).set_index("gene_id").to_sql( + table, self.engine, if_exists="append", index=True + ) inserted += df.shape[0] self._insert_info_xref(df) diff --git a/ebel/manager/orientdb/biodbs/nsides.py b/ebel/manager/orientdb/biodbs/nsides.py index 1d75d58..50b5909 100644 --- a/ebel/manager/orientdb/biodbs/nsides.py +++ b/ebel/manager/orientdb/biodbs/nsides.py @@ -1,19 +1,19 @@ """NSIDES module.""" -import tarfile import logging -import pandas as pd +import os +import tarfile +from typing import Dict, Optional -from tqdm import tqdm -from typing import Dict +import pandas as pd from pyorientdb import OrientDB +from tqdm import tqdm -from ebel.tools import get_file_path from ebel.constants import RID +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import OFFSIDES, ONSIDES -from ebel.manager.orientdb import odb_meta, urls, odb_structure - from ebel.manager.rdbms.models import nsides +from ebel.tools import get_file_path logger = logging.getLogger(__name__) pd.options.mode.chained_assignment = None @@ -39,13 +39,15 @@ class Nsides(odb_meta.Graph): def __init__(self, client: OrientDB = None): """Init nSIDES.""" self.client = client - self.biodb_name = 'nsides' + self.biodb_name = "nsides" self.urls = {OFFSIDES: urls.OFFSIDES, ONSIDES: urls.ONSIDES} - super().__init__(edges=odb_structure.nsides_edges, - nodes=odb_structure.nsides_nodes, - tables_base=nsides.Base, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + edges=odb_structure.nsides_edges, + nodes=odb_structure.nsides_nodes, + tables_base=nsides.Base, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __len__(self): return 
self.number_of_generics @@ -76,50 +78,60 @@ def insert_data(self) -> Dict[str, int]: https://github.com/tatonetti-lab/nsides-release/blob/master/release-notes/v0.1.md """ file_path = get_file_path(self.urls[OFFSIDES], self.biodb_name) - offsides_df = pd.read_csv(file_path, low_memory=False) - # Because of repeating header, we have to delete all rows which are equal to the columns - offsides_df = offsides_df[offsides_df.ne(offsides_df.columns).any(1)] - offsides_df.drop_duplicates(inplace=True) - offsides_df["source"] = "offsides" - offsides_df.columns = self._standardize_column_names(offsides_df.columns) - - onsides_df = self.__import_onsides() - combined_df = pd.concat([offsides_df, onsides_df], ignore_index=True, sort=False) - - combined_df.index += 1 - combined_df.index.rename('id', inplace=True) - combined_df.to_sql( + df = pd.read_csv(file_path, low_memory=False) + df.drop_duplicates(inplace=True) + df["source"] = "offsides" + df.columns = self._standardize_column_names(df.columns) + + df.index += 1 + df.index.rename("id", inplace=True) + df[pd.to_numeric(df["condition_meddra_id"], errors="coerce").notnull()].to_sql( nsides.Nsides.__tablename__, self.engine, - if_exists='replace', - chunksize=10000 + if_exists="append", + chunksize=100000, ) - return {self.biodb_name: combined_df.shape[0]} - - def __import_onsides(self): - """Extract OnSIDES CSV file and format into a DF.""" - file_path = get_file_path(self.urls[ONSIDES], self.biodb_name) - with tarfile.open(file_path, "r:*") as tar: - df = pd.read_csv(tar.extractfile("csv/adverse_reactions.csv"), header=0).drop_duplicates() - - df.drop( # Remove columns that aren't needed - ['xml_id', 'Unnamed: 10', "omop_concept_id", "drug_concept_ids", "concept_class_id"], - inplace=True, - axis=1, - ) - - df.rename(columns={ # Rename columns to match OFFSIDES - "rxnorm_ids": "drug_rxnorn_id", - "concept_name": "condition_concept_name", - "meddra_id": "condition_meddra_id", - "ingredients": "drug_concept_name", - }, 
inplace=True) - - # Keep rows with only 1 ingredient/drug - single_value_mask = df['drug_concept_name'].apply(lambda x: len(x.split(",")) == 1) - pruned_df = df.loc[single_value_mask] - pruned_df["source"] = "onsides" - return pruned_df + return {self.biodb_name: df.shape[0]} + + # TODO: Reimplement, but structure have changed + # def __import_onsides(self) -> Optional[pd.DataFrame]: + # """Extract OnSIDES CSV file and format into a DF.""" + # file_path = get_file_path(self.urls[ONSIDES], self.biodb_name) + # file_folder = os.path.dirname(file_path) + # tar = tarfile.open(file_path, "r:gz") + # folder_in_tar_file = tar.members[0].path + # fd = tar.extractfile(f"{folder_in_tar_file}/adverse_reactions.csv.gz") + # df = pd.read_csv(fd, compression="gzip", encoding="utf-8") + + # df.drop( # Remove columns that aren't needed + # [ + # "xml_id", + # "Unnamed: 10", + # "omop_concept_id", + # "drug_concept_ids", + # "concept_class_id", + # ], + # inplace=True, + # axis=1, + # ) + + # df.rename( + # columns={ # Rename columns to match OFFSIDES + # "rxnorm_ids": "drug_rxnorn_id", + # "pt_meddra_term": "condition_concept_name", + # "pt_meddra_id": "condition_meddra_id", + # "ingredients_names": "drug_concept_name", + # }, + # inplace=True, + # ) + + # # Keep rows with only 1 ingredient/drug + # single_value_mask = df["drug_concept_name"].apply( + # lambda x: len(x.split(",")) == 1 + # ) + # pruned_df = df.loc[single_value_mask] + # pruned_df["source"] = "onsides" + # return pruned_df def update_bel(self) -> int: """Create has_side_effect edges between drugs (drugbank) and side_effects. @@ -127,8 +139,8 @@ def update_bel(self) -> int: RxCUI (https://rxnav.nlm.nih.gov/RxNormAPIs.html#) is used as the mapping identifier. 
""" self.clear_edges() - self.delete_nodes_with_no_edges('side_effect') - self.delete_nodes_with_no_edges('drug') + self.delete_nodes_with_no_edges("side_effect") + self.delete_nodes_with_no_edges("drug") # TODO: Translate to sqlalchemy query sql_temp = """Select @@ -150,40 +162,44 @@ def update_bel(self) -> int: o.mean_reporting_frequency """ - drugbank_ids = self.query_class('drug', columns=['drugbank_id'], drugbank_id='notnull') - drugbank_id_rids = {d['drugbank_id']: d[RID] for d in drugbank_ids} + drugbank_ids = self.query_class("drug", columns=["drugbank_id"], drugbank_id="notnull") + drugbank_id_rids = {d["drugbank_id"]: d[RID] for d in drugbank_ids} - side_effects = self.query_class('side_effect', columns=['condition_meddra_id']) - side_effect_rids = {d['condition_meddra_id']: d[RID] for d in side_effects} + side_effects = self.query_class("side_effect", columns=["condition_meddra_id"]) + side_effect_rids = {d["condition_meddra_id"]: d[RID] for d in side_effects} updated = 0 - for drugbank_id, drugbank_rid in tqdm(drugbank_id_rids.items(), desc=f'Update {self.biodb_name.upper()}'): + for drugbank_id, drugbank_rid in tqdm(drugbank_id_rids.items(), desc=f"Update {self.biodb_name.upper()}"): for r in self.engine.execute(sql_temp.format(drugbank_id)): - (condition_meddra_id, - condition_concept_name, - prr, - mean_reporting_frequency) = r + ( + condition_meddra_id, + condition_concept_name, + prr, + mean_reporting_frequency, + ) = r if condition_meddra_id not in side_effect_rids: side_effect_rid = self.insert_record( - 'side_effect', + "side_effect", { - 'label': condition_concept_name, - 'condition_meddra_id': condition_meddra_id - } + "label": condition_concept_name, + "condition_meddra_id": condition_meddra_id, + }, ) side_effect_rids[condition_meddra_id] = side_effect_rid value_dict = { - 'prr': prr, - 'mean_reporting_frequency': mean_reporting_frequency + "prr": prr, + "mean_reporting_frequency": mean_reporting_frequency, } side_effect_rid = 
side_effect_rids[condition_meddra_id] - self.create_edge(class_name='has_side_effect', - from_rid=drugbank_rid, - to_rid=side_effect_rid, - value_dict=value_dict) + self.create_edge( + class_name="has_side_effect", + from_rid=drugbank_rid, + to_rid=side_effect_rid, + value_dict=value_dict, + ) updated += 1 return updated diff --git a/ebel/manager/orientdb/biodbs/pathway_commons.py b/ebel/manager/orientdb/biodbs/pathway_commons.py index c34a8cd..1fcc12b 100644 --- a/ebel/manager/orientdb/biodbs/pathway_commons.py +++ b/ebel/manager/orientdb/biodbs/pathway_commons.py @@ -1,22 +1,21 @@ """PathwayCommons module. Depends on HGNC.""" import warnings -import pandas as pd - -from tqdm import tqdm from typing import Dict + +import pandas as pd from pyorientdb import OrientDB +from tqdm import tqdm from ebel.constants import RID -from ebel.tools import get_file_path +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.biodbs.hgnc import Hgnc from ebel.manager.orientdb.constants import PATHWAY_COMMONS -from ebel.manager.orientdb import odb_meta, urls, odb_structure - -from ebel.manager.rdbms.models import hgnc, pathway_commons as pc - +from ebel.manager.rdbms.models import hgnc +from ebel.manager.rdbms.models import pathway_commons as pc +from ebel.tools import get_file_path -warnings.filterwarnings("ignore", 'This pattern has match groups') +warnings.filterwarnings("ignore", "This pattern has match groups") class PathwayCommons(odb_meta.Graph): @@ -30,11 +29,13 @@ def __init__(self, client: OrientDB = None): self.urls = {self.biodb_name: self.url} self.file_path = get_file_path(self.url, self.biodb_name) - super().__init__(generics=odb_structure.pathway_commons_generics, - edges=odb_structure.pathway_commons_edges, - tables_base=pc.Base, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + generics=odb_structure.pathway_commons_generics, + edges=odb_structure.pathway_commons_edges, + tables_base=pc.Base, + 
urls=self.urls, + biodb_name=self.biodb_name, + ) self.hgnc = Hgnc(self.client) def __len__(self) -> int: @@ -44,11 +45,7 @@ def __len__(self) -> int: def __repr__(self) -> str: """Represent PathwayCommons Integration as string.""" template = "{{BioDatabase:PathwayCommons}}[url:{url}, edges:{edges}, nodes:{generics}]" - representation = template.format( - url=self.url, - edges=self.number_of_edges, - generics=self.number_of_generics - ) + representation = template.format(url=self.url, edges=self.number_of_edges, generics=self.number_of_generics) return representation def __contains__(self, rs_number: int) -> bool: @@ -57,25 +54,35 @@ def __contains__(self, rs_number: int) -> bool: def insert_data(self) -> Dict[str, int]: """Insert data in generic OrientDB class.""" - usecols = ['PARTICIPANT_A', 'INTERACTION_TYPE', 'PARTICIPANT_B', 'INTERACTION_DATA_SOURCE', - 'INTERACTION_PUBMED_ID', 'PATHWAY_NAMES'] + usecols = [ + "PARTICIPANT_A", + "INTERACTION_TYPE", + "PARTICIPANT_B", + "INTERACTION_DATA_SOURCE", + "INTERACTION_PUBMED_ID", + "PATHWAY_NAMES", + ] df = pd.read_csv(self.file_path, sep="\t", low_memory=True, usecols=usecols) # Because 2 tables are in file, we have to identify where second table starts and slice the dataframe - df = df.iloc[:df[df['PARTICIPANT_A'] == 'PARTICIPANT'].index[0]] + df = df.iloc[: df[df["PARTICIPANT_A"] == "PARTICIPANT"].index[0]] df.columns = self._standardize_column_names(df.columns) - df.pathway_names = df.pathway_names.str.split(';') - df.interaction_data_source = df.interaction_data_source.str.split(';') - df.interaction_pubmed_id = df.interaction_pubmed_id.str.split(';') + df.pathway_names = df.pathway_names.str.split(";") + df.interaction_data_source = df.interaction_data_source.str.split(";") + df.interaction_pubmed_id = df.interaction_pubmed_id.str.split(";") df.index += 1 - df['id'] = df.index + df["id"] = df.index # insert main table - df[['participant_a', 'participant_b', 'interaction_type']].to_sql( + df[["participant_a", 
"participant_b", "interaction_type"]].to_sql( pc.PathwayCommons.__tablename__, - self.engine, if_exists='append', index=False, chunksize=10000) + self.engine, + if_exists="append", + index=False, + chunksize=10000, + ) # insert pmids, names, sources self.create_pmids_table(df) @@ -91,51 +98,54 @@ def insert_data(self) -> Dict[str, int]: def create_pmids_table(self, df): """Create the Pmid table.""" - df_pmids = df[['id', 'interaction_pubmed_id']].dropna().explode('interaction_pubmed_id') - df_pmids.rename(columns={ - 'id': 'pathway_commons_id', - 'interaction_pubmed_id': 'pmid' - }, inplace=True) - df_pmids.pmid = pd.to_numeric(df_pmids.pmid, errors='coerce') - df_pmids.to_sql(pc.Pmid.__tablename__, - con=self.engine, - index=False, - if_exists='append', - chunksize=10000 - ) + df_pmids = df[["id", "interaction_pubmed_id"]].dropna().explode("interaction_pubmed_id") + df_pmids.rename( + columns={"id": "pathway_commons_id", "interaction_pubmed_id": "pmid"}, + inplace=True, + ) + df_pmids.pmid = pd.to_numeric(df_pmids.pmid, errors="coerce") + df_pmids.to_sql( + pc.Pmid.__tablename__, + con=self.engine, + index=False, + if_exists="append", + chunksize=10000, + ) del df_pmids def create_joining_table_names(self, df, df_pc_names): """Create the joining table for Names.""" - df_pc_names_pc = df[['pathway_names', 'id']].explode('pathway_names').dropna() - df_pc_names_pc.rename(columns={'id': "pathway_commons_id", 'pathway_names': "name"}, inplace=True) - df_pc_names_pc.set_index('name', inplace=True) - df_pc_names['pathway_commons_pathway_name_id'] = df_pc_names.index - df_pc_names.set_index('name', inplace=True) - df_pc_names_pc.join(df_pc_names, how='inner').to_sql( + df_pc_names_pc = df[["pathway_names", "id"]].explode("pathway_names").dropna() + df_pc_names_pc.rename(columns={"id": "pathway_commons_id", "pathway_names": "name"}, inplace=True) + df_pc_names_pc.set_index("name", inplace=True) + df_pc_names["pathway_commons_pathway_name_id"] = df_pc_names.index + 
df_pc_names.set_index("name", inplace=True) + df_pc_names_pc.join(df_pc_names, how="inner").to_sql( pc.pathway_commons__pathway_name.fullname, con=self.engine, index=False, - if_exists='append', - chunksize=10000 + if_exists="append", + chunksize=10000, ) del df_pc_names_pc del df_pc_names def create_joining_table_sources(self, df, df_pc_sources): """Create the joining table for Source.""" - df_pc_sources_pc = df[['interaction_data_source', 'id']].explode('interaction_data_source').dropna() - df_pc_sources_pc.rename(columns={'id': "pathway_commons_id", 'interaction_data_source': "source"}, - inplace=True) - df_pc_sources_pc.set_index('source', inplace=True) - df_pc_sources['pathway_commons_source_id'] = df_pc_sources.index - df_pc_sources.set_index('source', inplace=True) - df_pc_sources_pc.join(df_pc_sources, how='inner').to_sql( + df_pc_sources_pc = df[["interaction_data_source", "id"]].explode("interaction_data_source").dropna() + df_pc_sources_pc.rename( + columns={"id": "pathway_commons_id", "interaction_data_source": "source"}, + inplace=True, + ) + df_pc_sources_pc.set_index("source", inplace=True) + df_pc_sources["pathway_commons_source_id"] = df_pc_sources.index + df_pc_sources.set_index("source", inplace=True) + df_pc_sources_pc.join(df_pc_sources, how="inner").to_sql( pc.pathway_commons__source.fullname, con=self.engine, index=False, - if_exists='append', - chunksize=10000 + if_exists="append", + chunksize=10000, ) del df_pc_sources_pc del df_pc_sources @@ -143,36 +153,36 @@ def create_joining_table_sources(self, df, df_pc_sources): def create_sources_table(self, df): """Create the Sources table.""" pc_sources = df.interaction_data_source.explode().str.strip().dropna().unique() - df_pc_sources = pd.DataFrame(pc_sources, columns=['source']) + df_pc_sources = pd.DataFrame(pc_sources, columns=["source"]) df_pc_sources.index += 1 - df_pc_sources.index.rename('id', inplace=True) - df_pc_sources.to_sql(pc.Source.__tablename__, self.engine, if_exists='append') 
+ df_pc_sources.index.rename("id", inplace=True) + df_pc_sources.to_sql(pc.Source.__tablename__, self.engine, if_exists="append") return df_pc_sources def create_names_table(self, df): """Create the Names table.""" pc_names = df.pathway_names.explode().str.strip().dropna().unique() - df_pc_names = pd.DataFrame(pc_names, columns=['name']) + df_pc_names = pd.DataFrame(pc_names, columns=["name"]) df_pc_names.index += 1 - df_pc_names.index.rename('id', inplace=True) - df_pc_names.to_sql(pc.PathwayName.__tablename__, self.engine, if_exists='append') + df_pc_names.index.rename("id", inplace=True) + df_pc_names.to_sql(pc.PathwayName.__tablename__, self.engine, if_exists="append") return df_pc_names def get_pathway_name_rid_dict(self) -> Dict[str, str]: """Get dict of pathway names as keys and their rIDs as values.""" pathway_name_rid_dict = {} - number = self.get_number_of_class('pc_pathway_name') + number = self.get_number_of_class("pc_pathway_name") # insert data from rdbms in odb if not exists if number == 0: - for pc_pathway_name, in self.session.query(pc.PathwayName.name).all(): - value_dict = {'name': pc_pathway_name} - rid = self.insert_record('pc_pathway_name', value_dict) + for (pc_pathway_name,) in self.session.query(pc.PathwayName.name).all(): + value_dict = {"name": pc_pathway_name} + rid = self.insert_record("pc_pathway_name", value_dict) pathway_name_rid_dict[pc_pathway_name] = rid # get data from odb else: - pc_pathway_names = self.query_class(class_name='pc_pathway_name', columns=['name']) - pathway_name_rid_dict = {x['name']: x[RID] for x in pc_pathway_names} + pc_pathway_names = self.query_class(class_name="pc_pathway_name", columns=["name"]) + pathway_name_rid_dict = {x["name"]: x[RID] for x in pc_pathway_names} return pathway_name_rid_dict def _get_rid(self, rid, valid, name, pure_symbol_rids_dict): @@ -185,10 +195,15 @@ def _get_rid(self, rid, valid, name, pure_symbol_rids_dict): if name in pure_symbol_rids_dict: _rid = pure_symbol_rids_dict[name] 
else: - _rid = self.insert_record('protein', {'name': name, - 'namespace': "HGNC", - 'bel': f'p(HGNC:"{name}")', - 'pure': True}) + _rid = self.insert_record( + "protein", + { + "name": name, + "namespace": "HGNC", + "bel": f'p(HGNC:"{name}")', + "pure": True, + }, + ) pure_symbol_rids_dict[name] = _rid return _rid @@ -199,19 +214,20 @@ def update_interactions(self) -> Dict[str, int]: pc_pathway_name_rid_dict = self.get_pathway_name_rid_dict() valid_hgnc_symbols = {x[0] for x in self.session.query(hgnc.Hgnc).with_entities(hgnc.Hgnc.symbol).all()} - cols = ['symbol', 'rid'] + cols = ["symbol", "rid"] pure_symbol_rids_dict = self.hgnc.get_pure_symbol_rids_dict() df_all = pd.DataFrame(pure_symbol_rids_dict.items(), columns=cols) - df_bel = pd.DataFrame(self.hgnc.get_pure_symbol_rids_dict_in_bel_context().items(), - columns=cols) + df_bel = pd.DataFrame(self.hgnc.get_pure_symbol_rids_dict_in_bel_context().items(), columns=cols) # skip here if there is no pure symbols with or without BEL context if any([df_all.empty, df_bel.empty]): return inserted - edge_types = ['controls-transport-of', - 'controls-expression-of', - 'controls-phosphorylation-of'] + edge_types = [ + "controls-transport-of", + "controls-expression-of", + "controls-phosphorylation-of", + ] for edge_type in edge_types: inserted[edge_type] = 0 @@ -219,40 +235,54 @@ def update_interactions(self) -> Dict[str, int]: sql = f"""Select id, participant_a, participant_b from pathway_commons where interaction_type='{edge_type}'""" df_ppi_of = pd.read_sql(sql, self.engine) - df_join = df_ppi_of.set_index('participant_a').join(df_all.set_index('symbol')).rename( - columns={'rid': 'rid_a_all'}).join(df_bel.set_index('symbol')).reset_index().rename( - columns={'rid': 'rid_a_bel', 'index': 'a'}).set_index('participant_b').join( - df_all.set_index('symbol')).rename(columns={'rid': 'rid_b_all'}).join( - df_bel.set_index('symbol')).reset_index().rename(columns={'rid': 'rid_b_bel', 'index': 'b'}).set_index( - 'id') - - 
df_join['a_is_valid'] = [(x in valid_hgnc_symbols) for x in df_join.a] - df_join['b_is_valid'] = [(x in valid_hgnc_symbols) for x in df_join.b] - - a_or_b_in_bel = (df_join.rid_a_bel.notnull() | df_join.rid_b_bel.notnull()) - both_valid = (df_join.a_is_valid & df_join.b_is_valid) + df_join = ( + df_ppi_of.set_index("participant_a") + .join(df_all.set_index("symbol")) + .rename(columns={"rid": "rid_a_all"}) + .join(df_bel.set_index("symbol")) + .reset_index() + .rename(columns={"rid": "rid_a_bel", "index": "a"}) + .set_index("participant_b") + .join(df_all.set_index("symbol")) + .rename(columns={"rid": "rid_b_all"}) + .join(df_bel.set_index("symbol")) + .reset_index() + .rename(columns={"rid": "rid_b_bel", "index": "b"}) + .set_index("id") + ) + + df_join["a_is_valid"] = [(x in valid_hgnc_symbols) for x in df_join.a] + df_join["b_is_valid"] = [(x in valid_hgnc_symbols) for x in df_join.b] + + a_or_b_in_bel = df_join.rid_a_bel.notnull() | df_join.rid_b_bel.notnull() + both_valid = df_join.a_is_valid & df_join.b_is_valid df_both = df_join[(a_or_b_in_bel & both_valid)] - for pc_id, row in tqdm(df_both.iterrows(), total=df_both.shape[0], desc=f'Update PC {edge_type}'): - + for pc_id, row in tqdm( + df_both.iterrows(), + total=df_both.shape[0], + desc=f"Update PC {edge_type}", + ): from_rid = self._get_rid(row.rid_a_all, row.a_is_valid, row.a, pure_symbol_rids_dict) to_rid = self._get_rid(row.rid_b_all, row.b_is_valid, row.b, pure_symbol_rids_dict) if from_rid and to_rid: pathways, pmids, sources = self.get_pathway_pmids_sources(pc_id, pc_pathway_name_rid_dict) value_dict = { - 'type': edge_type, - 'sources': sources, - 'pmids': pmids, - 'pathways': pathways + "type": edge_type, + "sources": sources, + "pmids": pmids, + "pathways": pathways, } - class_name = edge_type.replace('-', '_') + "_pc" - self.create_edge(class_name=class_name, - from_rid=from_rid, - to_rid=to_rid, - value_dict=value_dict, - ignore_empty_values=True) + class_name = edge_type.replace("-", "_") + 
"_pc" + self.create_edge( + class_name=class_name, + from_rid=from_rid, + to_rid=to_rid, + value_dict=value_dict, + ignore_empty_values=True, + ) inserted[edge_type] += 1 return inserted diff --git a/ebel/manager/orientdb/biodbs/protein_atlas.py b/ebel/manager/orientdb/biodbs/protein_atlas.py index 57fe90e..53481b5 100644 --- a/ebel/manager/orientdb/biodbs/protein_atlas.py +++ b/ebel/manager/orientdb/biodbs/protein_atlas.py @@ -1,15 +1,14 @@ """Protein Atlas module.""" -import pandas as pd +from typing import Dict, Optional +import pandas as pd from pyorientdb import OrientDB -from typing import Dict, Optional +from tqdm import tqdm -from ebel.tools import get_file_path from ebel.manager.orientdb import odb_meta, urls from ebel.manager.orientdb.constants import PROTEIN_ATLAS - from ebel.manager.rdbms.models import protein_atlas -from tqdm import tqdm +from ebel.tools import get_file_path class ProteinAtlas(odb_meta.Graph): @@ -27,16 +26,17 @@ def __init__(self, client: OrientDB = None): protein_atlas.ProteinAtlasRnaBrainFantom.__tablename__: urls.PROTEIN_ATLAS_RNA_FANTOM_BRAIN, protein_atlas.ProteinAtlasRnaMouseBrainAllen.__tablename__: urls.PROTEIN_ATLAS_RNA_MOUSE_BRAIN_ALLEN, } - super().__init__(urls=self.urls, - tables_base=protein_atlas.Base, - biodb_name=self.biodb_name) + super().__init__(urls=self.urls, tables_base=protein_atlas.Base, biodb_name=self.biodb_name) def __len__(self): return self.session.query(protein_atlas.ProteinAtlasNormalTissue).count() def __contains__(self, ensembl_id) -> bool: - count = self.session.query(protein_atlas.ProteinAtlasNormalTissue).filter( - protein_atlas.ProteinAtlasNormalTissue.gene == ensembl_id).count() + count = ( + self.session.query(protein_atlas.ProteinAtlasNormalTissue) + .filter(protein_atlas.ProteinAtlasNormalTissue.gene == ensembl_id) + .count() + ) return bool(count) def insert_data(self) -> Dict[str, int]: @@ -73,7 +73,7 @@ def __insert_table(self, model, use_cols=None, sep: str = "\t", chunksize: Optio for df 
in dfs: self._standardize_dataframe(df) df.index += 1 - df.index.rename('id', inplace=True) + df.index.rename("id", inplace=True) df.to_sql(table, self.engine, if_exists="append") number_of_inserts += df.shape[0] @@ -123,14 +123,18 @@ def _insert_rna_mouse_brain_allen(self) -> int: def get_tissues_by_ensembl_id(self, ensembl_gene_id): """Return tissues for a given ensembl ID.""" - level_exixs = protein_atlas.ProteinAtlasNormalTissue.level.in_(['Medium', 'High', 'Low']) + level_exixs = protein_atlas.ProteinAtlasNormalTissue.level.in_(["Medium", "High", "Low"]) columns = ( protein_atlas.ProteinAtlasNormalTissue.tissue, protein_atlas.ProteinAtlasNormalTissue.cell_type, - protein_atlas.ProteinAtlasNormalTissue.level + protein_atlas.ProteinAtlasNormalTissue.level, ) - data = [x for x in self.session.query(*columns).filter(level_exixs).filter_by(gene=ensembl_gene_id, - reliability='Approved')] + data = [ + x + for x in self.session.query(*columns) + .filter(level_exixs) + .filter_by(gene=ensembl_gene_id, reliability="Approved") + ] tissues = {} for d in data: if d.tissue not in tissues: @@ -160,26 +164,28 @@ def update_interactions(self) -> int: both('bel_relation').size()>=1 and hgnc.ensembl_gene_id IS NOT NULL""" - rid_ensembl_gene_ids = {x.oRecordData['ensembl_gene_id']: x for x in self.execute(match)} + rid_ensembl_gene_ids = {x.oRecordData["ensembl_gene_id"]: x for x in self.execute(match)} self.execute("Delete EDGE has_located_protein where levels IS NOT NULL") - location_rid_cache = {x['bel']: x['rid'] for x in self.query_class('location', columns=['bel'])} + location_rid_cache = {x["bel"]: x["rid"] for x in self.query_class("location", columns=["bel"])} for ensembl_gene_id, data in tqdm(rid_ensembl_gene_ids.items()): ns_location = "PROTEIN_ATLAS" pure_protein = data.oRecordData - ns = pure_protein['namespace'] - name = pure_protein['name'] - - value_dict = {'namespace': ns, - 'name': name, - 'hgnc': pure_protein['hgnc_id'], - 'involved_genes': 
[pure_protein['name']], - 'involved_other': [], - 'species': 9606, - 'uniprot': pure_protein.get('uniprot'), - 'label': pure_protein.get('label')} + ns = pure_protein["namespace"] + name = pure_protein["name"] + + value_dict = { + "namespace": ns, + "name": name, + "hgnc": pure_protein["hgnc_id"], + "involved_genes": [pure_protein["name"]], + "involved_other": [], + "species": 9606, + "uniprot": pure_protein.get("uniprot"), + "label": pure_protein.get("label"), + } tissues = self.get_tissues_by_ensembl_id(ensembl_gene_id) @@ -189,26 +195,28 @@ def update_interactions(self) -> int: bel = f'p({ns}:"{name}",{location_bel})' value_dict_located.update(bel=bel) protein_located_rid = self.get_create_rid( - class_name='protein', - value_dict=value_dict_located, - check_for='bel' + class_name="protein", value_dict=value_dict_located, check_for="bel" ) - self.create_edge(class_name='has_located_protein', - from_rid=pure_protein['rid'], - to_rid=protein_located_rid, - value_dict={'levels': levels}) + self.create_edge( + class_name="has_located_protein", + from_rid=pure_protein["rid"], + to_rid=protein_located_rid, + value_dict={"levels": levels}, + ) if location_bel in location_rid_cache: location_rid = location_rid_cache[location_bel] else: - location_rid = self.get_create_rid(class_name='location', - value_dict={ - 'namespace': ns_location, - 'name': tissue, - 'bel': location_bel - }, - check_for='bel') + location_rid = self.get_create_rid( + class_name="location", + value_dict={ + "namespace": ns_location, + "name": tissue, + "bel": location_bel, + }, + check_for="bel", + ) location_rid_cache[location_bel] = location_rid self.create_edge("has__location", from_rid=protein_located_rid, to_rid=location_rid) diff --git a/ebel/manager/orientdb/biodbs/reactome.py b/ebel/manager/orientdb/biodbs/reactome.py index f0ee478..71e89a2 100644 --- a/ebel/manager/orientdb/biodbs/reactome.py +++ b/ebel/manager/orientdb/biodbs/reactome.py @@ -1,17 +1,16 @@ """Reactome.""" import json 
-import pandas as pd - -from tqdm import tqdm from typing import Dict + +import pandas as pd from pyorientdb import OrientDB from sqlalchemy import distinct +from tqdm import tqdm -from ebel.tools import get_file_path +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.constants import REACTOME -from ebel.manager.orientdb import odb_meta, urls, odb_structure - from ebel.manager.rdbms.models import reactome +from ebel.tools import get_file_path class Reactome(odb_meta.Graph): @@ -22,20 +21,22 @@ def __init__(self, client: OrientDB = None): self.client = client self.biodb_name = REACTOME self.urls = {self.biodb_name: urls.REACTOME} - super().__init__(nodes=odb_structure.reactome_nodes, - tables_base=reactome.Base, - urls=self.urls, - biodb_name=self.biodb_name) + super().__init__( + nodes=odb_structure.reactome_nodes, + tables_base=reactome.Base, + urls=self.urls, + biodb_name=self.biodb_name, + ) def __repr__(self): """Represent the class.""" template = "{{BioDatabase:{class_name}}}[url:{url}, nodes_with_reactome:{num_reactome}, generics:{generics}]" - num = self.query_get_dict("Select count(*) from V where reactome_pathways is not null")[0]['count'] + num = self.query_get_dict("Select count(*) from V where reactome_pathways is not null")[0]["count"] representation = template.format( class_name=self.__class__.__name__, url=self.urls, num_reactome=num, - generics={k: v for k, v in self.number_of_generics.items() if v} + generics={k: v for k, v in self.number_of_generics.items() if v}, ) return representation @@ -47,14 +48,25 @@ def __contains__(self, reactome_id: str) -> bool: def insert_data(self) -> Dict[str, int]: """Insert data into OrientDB database.""" - columns = ['uniprot_accession', 'identifier', 'url', 'name', 'evidence_type', 'organism'] + columns = [ + "uniprot_accession", + "identifier", + "url", + "name", + "evidence_type", + "organism", + ] # evidence_type are: # 1. IEA:= Inferred from Electronic Annotation # 2. 
TAS:= Traceable Author Statement - df = pd.read_csv(get_file_path(self.urls[self.biodb_name], self.biodb_name), - sep="\t", names=columns, usecols=[0, 1, 3, 4, 5]) + df = pd.read_csv( + get_file_path(self.urls[self.biodb_name], self.biodb_name), + sep="\t", + names=columns, + usecols=[0, 1, 3, 4, 5], + ) df.index += 1 - df.index.rename('id', inplace=True) + df.index.rename("id", inplace=True) df.name = df.name.str.strip() df.to_sql(reactome.Reactome.__tablename__, self.engine, if_exists="append") @@ -67,17 +79,21 @@ def update_interactions(self) -> int: self.execute("Update protein set reactome_pathways = null") sql_update = "Update protein set reactome_pathways = {} where uniprot = '{}' and pure=true" - sql_accessions = "Select distinct(uniprot) as accession_id from protein " \ - "where uniprot IS NOT NULL and pure=true" + sql_accessions = ( + "Select distinct(uniprot) as accession_id from protein " "where uniprot IS NOT NULL and pure=true" + ) proteins = self.execute(sql_accessions) updated = 0 if proteins: for unipid_acc in tqdm(proteins, desc="Update Reactome info in pure proteins."): - accession_id = unipid_acc.oRecordData['accession_id'] - results = self.session.query(distinct(reactome.Reactome.name)).filter( - reactome.Reactome.uniprot_accession == accession_id).all() + accession_id = unipid_acc.oRecordData["accession_id"] + results = ( + self.session.query(distinct(reactome.Reactome.name)) + .filter(reactome.Reactome.uniprot_accession == accession_id) + .all() + ) if results: reactome_pathways = json.dumps([x[0] for x in results]) updated += self.execute(sql_update.format(reactome_pathways, accession_id))[0] diff --git a/ebel/manager/orientdb/biodbs/stringdb.py b/ebel/manager/orientdb/biodbs/stringdb.py index 99cdf4f..a92c005 100644 --- a/ebel/manager/orientdb/biodbs/stringdb.py +++ b/ebel/manager/orientdb/biodbs/stringdb.py @@ -1,19 +1,18 @@ """STRING module.""" import logging +from collections import namedtuple +from typing import Dict + import numpy as 
np import pandas as pd - -from tqdm import tqdm -from typing import Dict from pyorientdb import OrientDB -from collections import namedtuple +from tqdm import tqdm -from ebel.tools import get_file_path -from ebel.manager.orientdb.constants import STRINGDB +from ebel.manager.orientdb import odb_meta, odb_structure, urls from ebel.manager.orientdb.biodbs.hgnc import Hgnc -from ebel.manager.orientdb import odb_meta, urls, odb_structure - +from ebel.manager.orientdb.constants import STRINGDB from ebel.manager.rdbms.models import stringdb +from ebel.tools import get_file_path logger = logging.getLogger(__name__) @@ -28,13 +27,17 @@ def __init__(self, client: OrientDB = None): self.table_action = stringdb.StringDbAction.__tablename__ self.table_strdb = stringdb.StringDb.__tablename__ self.table_protein = stringdb.StringDbProtein.__tablename__ - self.urls = {self.table_strdb: urls.STRING_INTS, - self.table_action: urls.STRING_ACTIONS, - self.table_protein: urls.STRING_NAMES} - super().__init__(edges=odb_structure.stringdb_edges, - urls=self.urls, - tables_base=stringdb.Base, - biodb_name=self.biodb_name) + self.urls = { + self.table_strdb: urls.STRING_INTS, + self.table_action: urls.STRING_ACTIONS, + self.table_protein: urls.STRING_NAMES, + } + super().__init__( + edges=odb_structure.stringdb_edges, + urls=self.urls, + tables_base=stringdb.Base, + biodb_name=self.biodb_name, + ) def __len__(self) -> dict: """Get number of 'biogrid_interaction' graph edges.""" @@ -56,34 +59,51 @@ def insert_data(self) -> Dict[str, int]: def insert_link_data(self) -> int: """Insert link STRINGDB information into RDBMS.""" - logger.info(f'Insert {self.biodb_name} link data in RDBMS.') + logger.info(f"Insert {self.biodb_name} link data in RDBMS.") file_path = get_file_path(self.urls[self.table_protein], self.biodb_name) - df_protein = pd.read_csv(file_path, sep='\t', usecols=['protein_external_id', 'preferred_name']) + df_protein = pd.read_csv(file_path, sep="\t", 
usecols=["#string_protein_id", "preferred_name"]).rename( + columns={"#string_protein_id": "string_protein_id"} + ) # Define column types to improve memory efficiency - cols = ['protein1', 'protein2', 'neighborhood', 'neighborhood_transferred', 'fusion', 'cooccurence', - 'homology', 'coexpression', 'coexpression_transferred', 'experiments', 'experiments_transferred', - 'database', 'database_transferred', 'textmining', 'textmining_transferred', 'combined_score'] + cols = [ + "protein1", + "protein2", + "neighborhood", + "neighborhood_transferred", + "fusion", + "cooccurence", + "homology", + "coexpression", + "coexpression_transferred", + "experiments", + "experiments_transferred", + "database", + "database_transferred", + "textmining", + "textmining_transferred", + "combined_score", + ] col_types = dict() for col in cols: - if col.startswith('protein'): - col_types[col] = 'object' + if col.startswith("protein"): + col_types[col] = "object" else: - col_types[col] = 'uint16' + col_types[col] = "uint16" file_path = get_file_path(self.urls[self.table_strdb], self.biodb_name) df_link = pd.read_csv(file_path, dtype=col_types, sep=" ") - df = df_link.set_index('protein1').join(df_protein.set_index('protein_external_id'), how='inner') + df = df_link.set_index("protein1").join(df_protein.set_index("string_protein_id"), how="inner") df.reset_index(inplace=True) - df.rename(columns={'index': 'protein1', 'preferred_name': 'symbol1'}, inplace=True) - df = df.set_index('protein2').join(df_protein.set_index('protein_external_id'), how='inner') + df.rename(columns={"index": "protein1", "preferred_name": "symbol1"}, inplace=True) + df = df.set_index("protein2").join(df_protein.set_index("string_protein_id"), how="inner") df.reset_index(inplace=True) - df.rename(columns={'index': 'protein2', 'preferred_name': 'symbol2'}, inplace=True) + df.rename(columns={"index": "protein2", "preferred_name": "symbol2"}, inplace=True) df.index += 1 - df.index.rename('id', inplace=True) + 
df.index.rename("id", inplace=True) df.to_sql(self.table_strdb, self.engine, if_exists="append", chunksize=10000) @@ -92,37 +112,46 @@ def insert_link_data(self) -> int: def insert_protein_data(self) -> int: """Generates a dictionary of STRINGDB identifiers as keys and HGNC symbols as values.""" file_path = get_file_path(self.urls[self.table_protein], self.biodb_name) - df = pd.read_csv(file_path, sep='\t', usecols=['protein_external_id', 'preferred_name']) + df = pd.read_csv(file_path, sep="\t", usecols=["#string_protein_id", "preferred_name"]).rename( + columns={"#string_protein_id": "string_protein_id"} + ) df.index += 1 - df.index.rename('id', inplace=True) - df.to_sql(self.table_protein, self.engine, if_exists='append') + df.index.rename("id", inplace=True) + df.to_sql(self.table_protein, self.engine, if_exists="append") return df.shape[0] def insert_action_data(self) -> int: """Insert action STRINGDB information into RDGMS.""" - logger.info(f'Insert {self.biodb_name} action data in RDBMS.') + logger.info(f"Insert {self.biodb_name} action data in RDBMS.") file_path = get_file_path(self.urls[self.table_protein], self.biodb_name) - df_protein = pd.read_csv(file_path, sep='\t', usecols=['protein_external_id', 'preferred_name']) + df_protein = pd.read_csv(file_path, sep="\t", usecols=["#string_protein_id", "preferred_name"]).rename( + columns={"#string_protein_id": "string_protein_id"} + ) file_path = get_file_path(self.urls[self.table_action], self.biodb_name) df_action = pd.read_csv(file_path, sep="\t") # Replace 't' and 'f' values for True/False in appropriate columns - df_action.replace({'is_directional': {'t': True, 'f': False}, - 'a_is_acting': {'t': True, 'f': False}}, inplace=True) + df_action.replace( + { + "is_directional": {"t": True, "f": False}, + "a_is_acting": {"t": True, "f": False}, + }, + inplace=True, + ) # Replace NaN with None df_action.replace({np.nan: None}, inplace=True) - df = 
df_action.set_index('item_id_a').join(df_protein.set_index('protein_external_id'), how='inner') + df = df_action.set_index("item_id_a").join(df_protein.set_index("string_protein_id"), how="inner") df.reset_index(inplace=True) - df.rename(columns={'index': 'item_id_a', 'preferred_name': 'symbol1'}, inplace=True) - df = df.set_index('item_id_b').join(df_protein.set_index('protein_external_id'), how='inner') + df.rename(columns={"index": "item_id_a", "preferred_name": "symbol1"}, inplace=True) + df = df.set_index("item_id_b").join(df_protein.set_index("string_protein_id"), how="inner") df.reset_index(inplace=True) - df.rename(columns={'index': 'item_id_b', 'preferred_name': 'symbol2'}, inplace=True) + df.rename(columns={"index": "item_id_b", "preferred_name": "symbol2"}, inplace=True) df.index += 1 - df.index.rename('id', inplace=True) + df.index.rename("id", inplace=True) df.to_sql(self.table_action, self.engine, if_exists="append", chunksize=10000) @@ -148,22 +177,24 @@ def get_stringdb_symbols(self): def update_stringdb_interactions(self, hgnc: Hgnc) -> int: """Iterate through BEL proteins and adds stringdb edges to existing proteins in KG.""" - columns = ("neighborhood", - "neighborhood_transferred", - "fusion", - "cooccurence", - "homology", - "coexpression", - "coexpression_transferred", - "experiments", - "experiments_transferred", - "database", - "database_transferred", - "textmining", - "textmining_transferred", - "combined_score") - - bel_hgnc_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace='HGNC') + columns = ( + "neighborhood", + "neighborhood_transferred", + "fusion", + "cooccurence", + "homology", + "coexpression", + "coexpression_transferred", + "experiments", + "experiments_transferred", + "database", + "database_transferred", + "textmining", + "textmining_transferred", + "combined_score", + ) + + bel_hgnc_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace="HGNC") bel_hgncs = set(bel_hgnc_rid_dict.keys()) strdb_hgncs 
= self.get_stringdb_symbols() shared_hgncs = bel_hgncs & strdb_hgncs @@ -173,12 +204,11 @@ def update_stringdb_interactions(self, hgnc: Hgnc) -> int: strdb = stringdb.StringDb - for hgnc_a in tqdm(shared_hgncs, desc='Update has_ppi_st edges'): + for hgnc_a in tqdm(shared_hgncs, desc="Update has_ppi_st edges"): query = self.session.query(strdb) query = query.filter(strdb.symbol1 == hgnc_a, strdb.experiments > 0) for row in query.all(): - sorted_combi = tuple(sorted([row.symbol1, row.symbol2])) if sorted_combi not in already_inserted: @@ -188,10 +218,12 @@ def update_stringdb_interactions(self, hgnc: Hgnc) -> int: to_rid = self.get_create_rid_by_symbol(row.symbol2, bel_hgnc_rid_dict, hgnc) if from_rid and to_rid: - self.create_edge(class_name='has_ppi_st', - from_rid=from_rid, - to_rid=to_rid, - value_dict=value_dict) + self.create_edge( + class_name="has_ppi_st", + from_rid=from_rid, + to_rid=to_rid, + value_dict=value_dict, + ) already_inserted.add(sorted_combi) updated += 1 @@ -217,8 +249,13 @@ def get_create_rid_by_symbol(self, symbol: str, symbol_rid_dict: dict, hgnc: Hgn if symbol not in symbol_rid_dict: symbol = hgnc.get_correct_symbol(symbol) if symbol: - value_dict = {'name': symbol, 'namespace': 'HGNC', 'pure': True, 'bel': f'p(HGNC:"{symbol}")'} - symbol_rid_dict[symbol] = self.get_create_rid('protein', value_dict, check_for='bel') + value_dict = { + "name": symbol, + "namespace": "HGNC", + "pure": True, + "bel": f'p(HGNC:"{symbol}")', + } + symbol_rid_dict[symbol] = self.get_create_rid("protein", value_dict, check_for="bel") return symbol_rid_dict.get(symbol) def update_action_interactions(self, hgnc: Hgnc) -> int: @@ -234,27 +271,29 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: expression | inhibition | decreases-expression-of inhibition | inhibition | inhibits """ - translator = {('expression', None): 'increases_expression_of_st', - ('ptmod', None): 'controls_pmod_of_st', - ('activation', 'activation'): 'activates_st', - ('expression', 
'inhibition'): 'decreases_expression_of_st', - ('inhibition', 'inhibition'): 'inhibits_st'} + translator = { + ("expression", None): "increases_expression_of_st", + ("ptmod", None): "controls_pmod_of_st", + ("activation", "activation"): "activates_st", + ("expression", "inhibition"): "decreases_expression_of_st", + ("inhibition", "inhibition"): "inhibits_st", + } - Action = namedtuple('Action', ('symbol1', 'symbol2', 'mode', 'action', 'score')) + Action = namedtuple("Action", ("symbol1", "symbol2", "mode", "action", "score")) - columns = ', '.join(Action._fields) + columns = ", ".join(Action._fields) sql_temp = f"""Select {columns} from {self.table_action} where mode in ('activation', 'inhibition', 'ptmod', 'expression') and (symbol1='{{symbol}}' or symbol2='{{symbol}}') and is_directional=1 and a_is_acting=1""" - symbols_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace='HGNC') + symbols_rid_dict = self.get_pure_symbol_rids_dict_in_bel_context(namespace="HGNC") symbols = tuple(symbols_rid_dict.keys()) already_inserted = set() updated = 0 - for symbol in tqdm(symbols, desc='Update has_action_st edges'): + for symbol in tqdm(symbols, desc="Update has_action_st edges"): rows = self.engine.execute(sql_temp.format(symbol=symbol)) for row in rows.fetchall(): action = Action(*row) @@ -262,17 +301,18 @@ def update_action_interactions(self, hgnc: Hgnc) -> int: sorted_combi = tuple(sorted([action.symbol1, action.symbol2])) if sorted_combi not in already_inserted: - from_rid = self.get_create_rid_by_symbol(action.symbol1, symbols_rid_dict, hgnc) to_rid = self.get_create_rid_by_symbol(action.symbol2, symbols_rid_dict, hgnc) if from_rid and to_rid: class_name = translator[(action.mode, action.action)] - self.create_edge(class_name=class_name, - from_rid=from_rid, - to_rid=to_rid, - value_dict={'score': action.score}) + self.create_edge( + class_name=class_name, + from_rid=from_rid, + to_rid=to_rid, + value_dict={"score": action.score}, + ) 
already_inserted.update([sorted_combi]) updated += 1 diff --git a/ebel/manager/orientdb/biodbs/uniprot.py b/ebel/manager/orientdb/biodbs/uniprot.py index 5e667fa..bfdfe1d 100644 --- a/ebel/manager/orientdb/biodbs/uniprot.py +++ b/ebel/manager/orientdb/biodbs/uniprot.py @@ -1,22 +1,21 @@ """UniProt module.""" -import os -import re import ftplib import logging -import pandas as pd +import os +import re +from collections import namedtuple +from typing import Dict, List, Tuple, Union -from tqdm import tqdm -from pyorientdb import OrientDB +import pandas as pd from lxml.etree import iterparse -from collections import namedtuple -from typing import List, Dict, Tuple, Union +from pyorientdb import OrientDB +from tqdm import tqdm from ebel.defaults import default_tax_ids -from ebel.tools import gunzip, get_file_path from ebel.manager.orientdb import odb_meta, urls from ebel.manager.orientdb.constants import UNIPROT - from ebel.manager.rdbms.models import uniprot as up +from ebel.tools import get_file_path, gunzip logger = logging.getLogger(__name__) @@ -26,17 +25,21 @@ Organisms = Dict[int, str] Keywords = Dict[int, str] -Xpath = namedtuple('Xpath', ('recommended_name', - 'gene_name', - 'organism_scientific', - 'taxid', - 'function', - 'subcellular_locations', - 'location' - )) - -XML_URL = 'http://uniprot.org/uniprot' -XML_NAMESPACE = f'{{{XML_URL}}}' +Xpath = namedtuple( + "Xpath", + ( + "recommended_name", + "gene_name", + "organism_scientific", + "taxid", + "function", + "subcellular_locations", + "location", + ), +) + +XML_URL = "http://uniprot.org/uniprot" +XML_NAMESPACE = f"{{{XML_URL}}}" XN = {"n": XML_URL} @@ -44,16 +47,16 @@ class UniProtEntry: """Helper class for the import of data into RDMS.""" column_names = [ - 'name', - 'accession', - 'recommended_name', - 'gene_names', - 'taxid', - 'host_taxids', - 'function_id', - 'xref_ids', - 'keyword_ids', - 'subcellular_location_ids' + "name", + "accession", + "recommended_name", + "gene_names", + "taxid", + 
"host_taxids", + "function_id", + "xref_ids", + "keyword_ids", + "subcellular_location_ids", ] def __init__(self): @@ -85,23 +88,24 @@ def get_data_tuple(self): self.function_id, self.xref_ids, self.keyword_ids, - self.subcellular_location_ids) + self.subcellular_location_ids, + ) class UniProt(odb_meta.Graph): """UniProt Importer and interface.""" columns2import = [ - 'dataset', - 'accession', - 'name', - 'protein', - 'gene', - 'comment', - 'proteinExistence', - 'feature', - 'evidence', - 'dbReference' + "dataset", + "accession", + "name", + "protein", + "gene", + "comment", + "proteinExistence", + "feature", + "evidence", + "dbReference", ] def __init__(self, client: OrientDB = None, tax_ids: list = default_tax_ids): @@ -109,20 +113,22 @@ def __init__(self, client: OrientDB = None, tax_ids: list = default_tax_ids): self.client = client self.biodb_name = UNIPROT self.file_path = get_file_path(urls.UNIPROT_SPROT, self.biodb_name) - self.file_path_gunzipped = self.file_path.split('.gz')[0] - super().__init__(urls={UNIPROT: urls.UNIPROT_SPROT}, - tables_base=up.Base, - biodb_name=self.biodb_name) + self.file_path_gunzipped = self.file_path.split(".gz")[0] + super().__init__( + urls={UNIPROT: urls.UNIPROT_SPROT}, + tables_base=up.Base, + biodb_name=self.biodb_name, + ) self.tax_ids = tax_ids self.xpath_pattern = Xpath( - recommended_name='./n:recommendedName/n:fullName[1]/text()', - gene_name='./n:name/text()', + recommended_name="./n:recommendedName/n:fullName[1]/text()", + gene_name="./n:name/text()", organism_scientific="./n:name[@type='scientific'][1]/text()", taxid="./n:dbReference[@type='NCBI Taxonomy'][1]/@id", function="./n:text/text()", subcellular_locations="./n:subcellular_location/n:location/text()", - location='./n:subcellularLocation/n:location' + location="./n:subcellularLocation/n:location", ) def __contains__(self, accession) -> bool: @@ -133,19 +139,19 @@ def __contains__(self, accession) -> bool: def __len__(self) -> int: """Return number of 
proteins (nodes) with uniprot accession number.""" sql = "Select count(*) from protein where uniprot IS NOT NULL" - return self.execute(sql)[0].oRecordData['count'] + return self.execute(sql)[0].oRecordData["count"] def insert_data(self) -> Dict[str, int]: """Insert UniProt data depending on NCBI taxonomy identifier.""" dialect = self.session.bind.dialect.name - if dialect == 'mysql': + if dialect == "mysql": self.engine.execute("SET FOREIGN_KEY_CHECKS=0") inserted = self.insert_uniprot() self.add_gene_symbols() self.session.commit() - if dialect == 'mysql': + if dialect == "mysql": self.engine.execute("SET FOREIGN_KEY_CHECKS=1") return {self.biodb_name: inserted} @@ -158,8 +164,9 @@ def add_gene_symbols(self): self._add_fly_symbols() def _delete_gene_symbols(self, taxid) -> None: - self.session.query(up.GeneSymbol).filter(up.GeneSymbol.uniprot.has(up.Uniprot.taxid == taxid)) \ - .delete(synchronize_session=False) + self.session.query(up.GeneSymbol).filter(up.GeneSymbol.uniprot.has(up.Uniprot.taxid == taxid)).delete( + synchronize_session=False + ) self.session.commit() def _add_hgnc_symbols(self) -> int: @@ -167,10 +174,13 @@ def _add_hgnc_symbols(self) -> int: inserted = 0 self._delete_gene_symbols(9606) - df = pd.read_csv(urls.HGNC_TSV, sep='\t', - low_memory=False, usecols=['symbol', 'uniprot_ids']).dropna() + df = pd.read_csv(urls.HGNC_TSV, sep="\t", low_memory=False, usecols=["symbol", "uniprot_ids"]).dropna() - for row in tqdm(df.itertuples(index=False), desc="Add HGNC symbols to UniProt", total=df.shape[0]): + for row in tqdm( + df.itertuples(index=False), + desc="Add HGNC symbols to UniProt", + total=df.shape[0], + ): uniprot = self.session.query(up.Uniprot).filter(up.Uniprot.accession == row.uniprot_ids).scalar() if uniprot: self.session.add(up.GeneSymbol(symbol=row.symbol, uniprot_id=uniprot.id)) @@ -185,9 +195,18 @@ def _add_mgi_symbols(self) -> int: self._delete_gene_symbols(10090) - df = pd.read_csv(urls.UNIPROT_MGI, sep="\t", usecols=[1, 6], 
names=['symbol', 'uniprot_accession']).dropna() + df = pd.read_csv( + urls.UNIPROT_MGI, + sep="\t", + usecols=[1, 6], + names=["symbol", "uniprot_accession"], + ).dropna() - for row in tqdm(df.itertuples(index=False), desc="Add MGI symbols to UniProt", total=df.shape[0]): + for row in tqdm( + df.itertuples(index=False), + desc="Add MGI symbols to UniProt", + total=df.shape[0], + ): uniprot = self.session.query(up.Uniprot).filter(up.Uniprot.accession == row.uniprot_accession).scalar() if uniprot: @@ -206,14 +225,18 @@ def _add_rgd_symbols(self) -> int: df = pd.read_csv( urls.UNIPROT_RGD, sep="\t", - comment='#', + comment="#", low_memory=False, - usecols=['SYMBOL', 'UNIPROT_ID'] + usecols=["SYMBOL", "UNIPROT_ID"], ).dropna() - df.columns = ['symbol', 'uniprot_accessions'] - df.uniprot_accessions = df.uniprot_accessions.str.split(';') - - for row in tqdm(df.itertuples(index=False), desc="Add RGD symbols to UniProt", total=df.shape[0]): + df.columns = ["symbol", "uniprot_accessions"] + df.uniprot_accessions = df.uniprot_accessions.str.split(";") + + for row in tqdm( + df.itertuples(index=False), + desc="Add RGD symbols to UniProt", + total=df.shape[0], + ): for accession in row.uniprot_accessions: uniprot = self.session.query(up.Uniprot).filter(up.Uniprot.accession == accession).scalar() if uniprot: @@ -227,27 +250,31 @@ def _add_fly_symbols(self) -> int: """Add MGI symbols from genenames.org.""" inserted = 0 ftp_base = "ftp.flybase.org" - ftp_folder = '/releases/current/precomputed_files/genes/' + ftp_folder = "/releases/current/precomputed_files/genes/" ftp = ftplib.FTP(ftp_base) ftp.login("anonymous", "anonymous") files = ftp.nlst(ftp_folder) - file_found = [x for x in files if re.search(r'fbgn_NAseq_Uniprot_fb_\d{4}_\d{2}.tsv.gz', x)] + file_found = [x for x in files if re.search(r"fbgn_NAseq_Uniprot_fb_\d{4}_\d{2}.tsv.gz", x)] if file_found: - url = f'ftp://{ftp_base}{file_found[0]}' + url = f"ftp://{ftp_base}{file_found[0]}" inserted = 0 
self._delete_gene_symbols(7227) df = pd.read_csv( url, - comment='#', - sep='\t', + comment="#", + sep="\t", header=None, usecols=[0, 5], - names=['symbol', 'uniprot_accession'] + names=["symbol", "uniprot_accession"], ).dropna() - for row in tqdm(df.itertuples(index=False), desc="Add FlyBase symbols to UniProt", total=df.shape[0]): + for row in tqdm( + df.itertuples(index=False), + desc="Add FlyBase symbols to UniProt", + total=df.shape[0], + ): uniprot = self.session.query(up.Uniprot).filter(up.Uniprot.accession == row.uniprot_accession).scalar() if uniprot: self.session.add(up.GeneSymbol(symbol=row.symbol, uniprot_id=uniprot.id)) @@ -259,16 +286,18 @@ def _add_fly_symbols(self) -> int: def update_bel(self) -> Dict[str, int]: """Update uniprot links in protein nodes to uniprot class.""" updated = dict() - updated['HGNC'] = self._update_proteins('HGNC', 9606) - updated['MGI'] = self._update_proteins('MGI', 10090) - updated['RGD'] = self._update_proteins('RGD', 10116) - updated['UNIPROT'] = self._update_uniprot_proteins() + updated["HGNC"] = self._update_proteins("HGNC", 9606) + updated["MGI"] = self._update_proteins("MGI", 10090) + updated["RGD"] = self._update_proteins("RGD", 10116) + updated["UNIPROT"] = self._update_uniprot_proteins() return updated def _update_protein_node(self, uniprot_accession, recommended_name, name, namespace, taxid) -> int: """Update all proteins using UNIPROT as namespace. 
Returns numbers of updated.""" - sql = f'Update protein set uniprot = "{uniprot_accession}", label = "{recommended_name}", ' \ - f'species = {taxid} where namespace = "{namespace}" and name = "{name}"' + sql = ( + f'Update protein set uniprot = "{uniprot_accession}", label = "{recommended_name}", ' + f'species = {taxid} where namespace = "{namespace}" and name = "{name}"' + ) return self.execute(sql)[0] def _get_accesssion_recname(self, taxid, gene_symbol) -> Union[Tuple[str, str], None]: @@ -278,8 +307,10 @@ def _get_accesssion_recname(self, taxid, gene_symbol) -> Union[Tuple[str, str], """ # TODO: This is in general a dangerous method because it selects the first accession number, but there could # be more than one - sql = f'Select accession, recommended_name from uniprot as u inner join uniprot_gene_symbol as gs ' \ - f'on (u.id=gs.uniprot_id) where u.taxid={taxid} and gs.symbol="{gene_symbol}" limit 1' + sql = ( + f"Select accession, recommended_name from uniprot as u inner join uniprot_gene_symbol as gs " + f'on (u.id=gs.uniprot_id) where u.taxid={taxid} and gs.symbol="{gene_symbol}" limit 1' + ) results = self.engine.execute(sql) return results.fetchone() if results else None @@ -297,8 +328,10 @@ def _update_proteins(self, namespace, taxid) -> int: num_updated = self._update_protein_node(accession, recommended_name, protein.name, namespace, taxid) updated += num_updated else: - err_txt = f'NO MAPPING TO UNIPROT: {namespace}:{protein.name} can not be mapped to uniprot. ' \ - f'Possible reasons: Withdrawn or only predicted.' + err_txt = ( + f"NO MAPPING TO UNIPROT: {namespace}:{protein.name} can not be mapped to uniprot. " + f"Possible reasons: Withdrawn or only predicted." 
+ ) logging.warning(err_txt) return updated @@ -314,8 +347,10 @@ def _update_uniprot_proteins(self) -> int: updated = 0 sql_temp = "Select recommended_name, taxid from uniprot where accession='{}' limit 1" sql_uniprot = 'SELECT distinct(name) as accession from protein WHERE namespace="UNIPROT"' - sql_update = 'Update protein set uniprot = name, label = "{}", species = {} ' \ - 'where namespace = "UNIPROT" and name = "{}"' + sql_update = ( + 'Update protein set uniprot = name, label = "{}", species = {} ' + 'where namespace = "UNIPROT" and name = "{}"' + ) for protein in self.query(sql_uniprot).itertuples(index=False): sql = sql_temp.format(protein.accession) found = self.engine.execute(sql).fetchone() @@ -331,7 +366,9 @@ def _update_uniprot_proteins(self) -> int: updated += num_updated return updated - def __read_linked_tables(self) -> Tuple[Keywords, Organisms, Xrefs, Functions, ScLocations, int]: + def __read_linked_tables( + self, + ) -> Tuple[Keywords, Organisms, Xrefs, Functions, ScLocations, int]: """Return xref, functions and subcellular locations.""" xrefs: Xrefs = {} functions: Functions = {} @@ -342,97 +379,96 @@ def __read_linked_tables(self) -> Tuple[Keywords, Organisms, Xrefs, Functions, S sclocation_index = 0 keywords: Keywords = {} - doc = iterparse(self.file_path_gunzipped, events=('end',), tag=f'{XML_NAMESPACE}entry') + doc = iterparse(self.file_path_gunzipped, events=("end",), tag=f"{XML_NAMESPACE}entry") counter = 0 - logger.info('Read linked UniProt tables') - - for action, elem in tqdm(doc, desc='Get linked UniProt data'): + logger.info("Read linked UniProt tables") + for action, elem in tqdm(doc, desc="Get linked UniProt data"): counter += 1 for child in elem.iterchildren(): + ctag = child.tag[len(XML_NAMESPACE) :] - ctag = child.tag[len(XML_NAMESPACE):] - - if ctag == 'dbReference': - x_type_id = (child.attrib.get('type'), - child.attrib.get('id')) + if ctag == "dbReference": + x_type_id = (child.attrib.get("type"), child.attrib.get("id")) if 
x_type_id not in xrefs: xref_index += 1 xrefs[x_type_id] = xref_index - elif ctag in ['organismHost', 'organism']: - taxid = int(child.xpath( - self.xpath_pattern.taxid, - namespaces=XN)[0]) + elif ctag in ["organismHost", "organism"]: + taxid = int(child.xpath(self.xpath_pattern.taxid, namespaces=XN)[0]) if taxid not in organisms: - organism_scientific = child.xpath( - self.xpath_pattern.organism_scientific, - namespaces=XN - )[0] + organism_scientific = child.xpath(self.xpath_pattern.organism_scientific, namespaces=XN)[0] organisms[taxid] = organism_scientific elif ctag == "keyword": - keyword_id = int(child.attrib.get('id').split('KW-')[1]) + keyword_id = int(child.attrib.get("id").split("KW-")[1]) if keyword_id not in keywords: keywords[keyword_id] = child.text - elif ctag == 'comment': - ctype = child.attrib.get('type') + elif ctag == "comment": + ctype = child.attrib.get("type") - if ctype == 'function': - func = child.xpath( - self.xpath_pattern.function, - namespaces=XN)[0] + if ctype == "function": + func = child.xpath(self.xpath_pattern.function, namespaces=XN)[0] if func not in functions: function_index += 1 functions[func] = function_index - elif ctype == 'subcellular location': - scls = child.findall(self.xpath_pattern.location, - namespaces=XN) + elif ctype == "subcellular location": + scls = child.findall(self.xpath_pattern.location, namespaces=XN) for scl in [x.text for x in scls]: - if scl not in sclocations: sclocation_index += 1 sclocations[scl] = sclocation_index elem.clear() return keywords, organisms, xrefs, functions, sclocations, counter - def __insert_linked_data(self, keywords: Keywords, organisms: Organisms, xrefs: Xrefs, functions: Functions, - sclocations: ScLocations): + def __insert_linked_data( + self, + keywords: Keywords, + organisms: Organisms, + xrefs: Xrefs, + functions: Functions, + sclocations: ScLocations, + ): """Insert data from file.""" if keywords: - df_kw = pd.DataFrame(keywords.items(), columns=['keywordid', 
'keyword_name']) + df_kw = pd.DataFrame(keywords.items(), columns=["keywordid", "keyword_name"]) self._insert_into_database(df_kw, up.Keyword) if organisms: - df_org = pd.DataFrame(organisms.items(), columns=['taxid', 'scientific_name']) + df_org = pd.DataFrame(organisms.items(), columns=["taxid", "scientific_name"]) self._insert_into_database(df_org, up.Organism) if sclocations: - df_sl = pd.DataFrame([x[::-1] for x in sclocations.items()], columns=['id', 'name']) + df_sl = pd.DataFrame([x[::-1] for x in sclocations.items()], columns=["id", "name"]) self._insert_into_database(df_sl, up.SubcellularLocation) if xrefs: - df_xr = pd.DataFrame([(v, k[0], k[1]) for k, v in xrefs.items()], columns=['id', 'db', 'identifier']) + df_xr = pd.DataFrame( + [(v, k[0], k[1]) for k, v in xrefs.items()], + columns=["id", "db", "identifier"], + ) self._insert_into_database(df_xr, up.Xref) if functions: - df_fc = pd.DataFrame([x[::-1] for x in functions.items()], columns=['id', 'description']) + df_fc = pd.DataFrame([x[::-1] for x in functions.items()], columns=["id", "description"]) self._insert_into_database(df_fc, up.Function) def _insert_into_database(self, dataframe, model): logger.info(f"insert into {model.__tablename__}") - dataframe.to_sql(model.__tablename__, - self.engine, - chunksize=100000, - if_exists='append', - index=False) + dataframe.to_sql( + model.__tablename__, + self.engine, + chunksize=100000, + if_exists="append", + index=False, + ) inserted = dataframe.shape[0] del dataframe return inserted @@ -448,7 +484,14 @@ def insert_uniprot(self) -> int: if not os.path.exists(self.file_path_gunzipped): gunzip(self.file_path, self.file_path_gunzipped) - keywords, hosts, xrefs, functions, sclocations, number_of_entries = self.__read_linked_tables() + ( + keywords, + hosts, + xrefs, + functions, + sclocations, + number_of_entries, + ) = self.__read_linked_tables() self.__insert_linked_data(keywords, hosts, xrefs, functions, sclocations) inserted = 
self.__insert_uniprot_data(xrefs, functions, sclocations, number_of_entries) @@ -462,16 +505,15 @@ def insert_uniprot(self) -> int: @staticmethod def shorten_tag(elem): """Shorten the given tag.""" - return elem.tag[len(XML_NAMESPACE):] + return elem.tag[len(XML_NAMESPACE) :] @staticmethod def get_tag(name): """Get tag based on given name.""" - return f'{XML_NAMESPACE}{name}' + return f"{XML_NAMESPACE}{name}" def __insert_uniprot_data(self, xrefs, functions, sclocations, number_of_entries) -> int: - - doc = iterparse(self.file_path_gunzipped, events=('end',), tag=self.get_tag('entry')) + doc = iterparse(self.file_path_gunzipped, events=("end",), tag=self.get_tag("entry")) counter = 0 @@ -484,88 +526,117 @@ def __insert_uniprot_data(self, xrefs, functions, sclocations, number_of_entries for child in elem.iterchildren(): ctag = self.shorten_tag(child) - if ctag == 'accession' and not uniprot.accession: + if ctag == "accession" and not uniprot.accession: uniprot.accession = child.text - elif ctag == 'name': + elif ctag == "name": uniprot.name = child.text - elif ctag == 'gene': - for cchild in child.iterchildren(tag=self.get_tag('name')): + elif ctag == "gene": + for cchild in child.iterchildren(tag=self.get_tag("name")): uniprot.gene_names.append(cchild.text) - elif ctag == 'organism': - for cchild in child.iterchildren(tag=self.get_tag('dbReference')): - if cchild.attrib.get('type') == 'NCBI Taxonomy': - uniprot.taxid = int(cchild.attrib.get('id')) + elif ctag == "organism": + for cchild in child.iterchildren(tag=self.get_tag("dbReference")): + if cchild.attrib.get("type") == "NCBI Taxonomy": + uniprot.taxid = int(cchild.attrib.get("id")) break - elif ctag == 'protein': - for cchild in child.iterchildren(tag=self.get_tag('recommendedName')): - for ccchild in cchild.iterchildren(tag=self.get_tag('fullName')): + elif ctag == "protein": + for cchild in child.iterchildren(tag=self.get_tag("recommendedName")): + for ccchild in 
cchild.iterchildren(tag=self.get_tag("fullName")): uniprot.recommended_name = ccchild.text break - elif ctag == 'organismHost': - for cchild in child.iterchildren(tag=self.get_tag('dbReference')): - if cchild.attrib.get('type') == 'NCBI Taxonomy': - uniprot.host_taxids.append(int(cchild.attrib.get('id'))) + elif ctag == "organismHost": + for cchild in child.iterchildren(tag=self.get_tag("dbReference")): + if cchild.attrib.get("type") == "NCBI Taxonomy": + uniprot.host_taxids.append(int(cchild.attrib.get("id"))) - elif ctag == 'comment': - ctype = child.attrib.get('type') + elif ctag == "comment": + ctype = child.attrib.get("type") - if ctype == 'function': - for cchild in child.iterchildren(tag=self.get_tag('text')): + if ctype == "function": + for cchild in child.iterchildren(tag=self.get_tag("text")): uniprot.function_id = functions[cchild.text] break - elif ctype == 'subcellular location': - for cchild in child.iterchildren(tag=self.get_tag('subcellularLocation')): - for ccchild in cchild.iterchildren(tag=self.get_tag('location')): + elif ctype == "subcellular location": + for cchild in child.iterchildren(tag=self.get_tag("subcellularLocation")): + for ccchild in cchild.iterchildren(tag=self.get_tag("location")): uniprot.subcellular_location_ids.append(sclocations[ccchild.text]) - elif ctag == 'dbReference': - xref = child.attrib.get('type'), child.attrib.get('id') + elif ctag == "dbReference": + xref = child.attrib.get("type"), child.attrib.get("id") uniprot.xref_ids.append(xrefs[xref]) elif ctag == "keyword": - keyword_id = int(child.attrib.get('id').split('KW-')[1]) + keyword_id = int(child.attrib.get("id").split("KW-")[1]) uniprot.keyword_ids.append(keyword_id) uniprots.append(uniprot) elem.clear() df = pd.DataFrame([u.get_data_tuple() for u in uniprots], columns=UniProtEntry.column_names) df.index += 1 - df.index.rename('id', inplace=True) - cols_uniprot = ['name', 'accession', 'recommended_name', 'taxid', 'function_id'] + df.index.rename("id", 
inplace=True) + cols_uniprot = ["name", "accession", "recommended_name", "taxid", "function_id"] logger.info(f"start insert {up.Uniprot.__tablename__}") - df[cols_uniprot].to_sql(up.Uniprot.__tablename__, self.engine, if_exists='append', chunksize=100000) + df[cols_uniprot].to_sql(up.Uniprot.__tablename__, self.engine, if_exists="append", chunksize=100000) - df['uniprot_id'] = df.index + df["uniprot_id"] = df.index logger.info(f"start insert {up.Gene.__tablename__}") - df[['gene_names', 'uniprot_id']].explode('gene_names').dropna().rename(columns={'gene_names': 'name'}) \ - .to_sql(up.Gene.__tablename__, self.engine, if_exists='append', index=False, chunksize=100000) + df[["gene_names", "uniprot_id"]].explode("gene_names").dropna().rename(columns={"gene_names": "name"}).to_sql( + up.Gene.__tablename__, + self.engine, + if_exists="append", + index=False, + chunksize=100000, + ) logger.info(f"start insert {up.uniprot__uniprot_keyword.name}") - df[['keyword_ids', 'uniprot_id']].explode('keyword_ids').dropna() \ - .rename(columns={'keyword_ids': 'uniprot_keyword_id'}) \ - .to_sql(up.uniprot__uniprot_keyword.name, self.engine, if_exists='append', index=False, chunksize=100000) + df[["keyword_ids", "uniprot_id"]].explode("keyword_ids").dropna().rename( + columns={"keyword_ids": "uniprot_keyword_id"} + ).to_sql( + up.uniprot__uniprot_keyword.name, + self.engine, + if_exists="append", + index=False, + chunksize=100000, + ) logger.info(f"start insert {up.uniprot__uniprot_host.name}") - df[['host_taxids', 'uniprot_id']].explode('host_taxids').dropna() \ - .rename(columns={'host_taxids': 'uniprot_organism_id'}) \ - .to_sql(up.uniprot__uniprot_host.name, self.engine, if_exists='append', index=False, chunksize=100000) + df[["host_taxids", "uniprot_id"]].explode("host_taxids").dropna().rename( + columns={"host_taxids": "uniprot_organism_id"} + ).to_sql( + up.uniprot__uniprot_host.name, + self.engine, + if_exists="append", + index=False, + chunksize=100000, + ) 
logger.info(f"start insert {up.uniprot__uniprot_xref.name}") - df[['xref_ids', 'uniprot_id']].explode('xref_ids').dropna().rename(columns={'xref_ids': 'uniprot_xref_id'}) \ - .to_sql(up.uniprot__uniprot_xref.name, self.engine, if_exists='append', index=False, chunksize=100000) + df[["xref_ids", "uniprot_id"]].explode("xref_ids").dropna().rename( + columns={"xref_ids": "uniprot_xref_id"} + ).to_sql( + up.uniprot__uniprot_xref.name, + self.engine, + if_exists="append", + index=False, + chunksize=100000, + ) logger.info(f"start insert {up.uniprot__uniprot_subcellular_location.name}") - df[['subcellular_location_ids', 'uniprot_id']].explode('subcellular_location_ids').dropna() \ - .rename(columns={'subcellular_location_ids': 'uniprot_subcellular_location_id'}) \ - .to_sql(up.uniprot__uniprot_subcellular_location.name, self.engine, if_exists='append', index=False, - chunksize=100000) + df[["subcellular_location_ids", "uniprot_id"]].explode("subcellular_location_ids").dropna().rename( + columns={"subcellular_location_ids": "uniprot_subcellular_location_id"} + ).to_sql( + up.uniprot__uniprot_subcellular_location.name, + self.engine, + if_exists="append", + index=False, + chunksize=100000, + ) return df.shape[0] diff --git a/ebel/manager/orientdb/constants.py b/ebel/manager/orientdb/constants.py index 7e1b9e4..83e40ab 100644 --- a/ebel/manager/orientdb/constants.py +++ b/ebel/manager/orientdb/constants.py @@ -2,35 +2,35 @@ # Databases -BIOGRID = 'biogrid' -CHEBI = 'chebi' -CLINICAL_TRIALS = 'clinical_trials' -CLINVAR = 'clinvar' -DISGENET = 'disgenet' -DRUGBANK = 'drugbank' -ENSEMBL = 'ensembl' -FLYBASE = 'flybase' -GWAS_CATALOG = 'gwas_catalog' -HGNC = 'hgnc' -EXPRESSION_ATLAS = 'expression_atlas' -INTACT = 'intact' -IUPHAR = 'iuphar' -KEGG = 'kegg' -MGI = 'mgi' -MIRTARBASE = 'mirtarbase' -NCBI = 'ncbi' -OFFSIDES = 'offsides' -ONSIDES = 'onsides' -NSIDES = 'nsides' -PATHWAY_COMMONS = 'pathway_commons' -PROTEIN_ATLAS = 'protein_atlas' -REACTOME = 'reactome' -RGD = 'rgd' 
-SIDER = 'sider' -STITCH = 'stitch' -STRINGDB = 'stringdb' -UNIPROT = 'uniprot' +BIOGRID = "biogrid" +CHEBI = "chebi" +CLINICAL_TRIALS = "clinical_trials" +CLINVAR = "clinvar" +DISGENET = "disgenet" +DRUGBANK = "drugbank" +ENSEMBL = "ensembl" +FLYBASE = "flybase" +GWAS_CATALOG = "gwas_catalog" +HGNC = "hgnc" +EXPRESSION_ATLAS = "expression_atlas" +INTACT = "intact" +IUPHAR = "iuphar" +KEGG = "kegg" +MGI = "mgi" +MIRTARBASE = "mirtarbase" +NCBI = "ncbi" +OFFSIDES = "offsides" +ONSIDES = "onsides" +NSIDES = "nsides" +PATHWAY_COMMONS = "pathway_commons" +PROTEIN_ATLAS = "protein_atlas" +REACTOME = "reactome" +RGD = "rgd" +SIDER = "sider" +STITCH = "stitch" +STRINGDB = "stringdb" +UNIPROT = "uniprot" # Data Types -NODES = 'nodes' -EDGES = 'edges' +NODES = "nodes" +EDGES = "edges" diff --git a/ebel/manager/orientdb/importer.py b/ebel/manager/orientdb/importer.py index db2eefc..67c8258 100644 --- a/ebel/manager/orientdb/importer.py +++ b/ebel/manager/orientdb/importer.py @@ -1,29 +1,28 @@ """Methods for importing BEL files into OrientDB.""" -import os -import re -import git import json import logging - -from tqdm import tqdm -from typing import Tuple +import os +import re +from collections import OrderedDict, defaultdict, namedtuple from copy import deepcopy -from pyorientdb import OrientDB from datetime import datetime from pathlib import Path +from typing import Tuple + +import git from git.exc import InvalidGitRepositoryError -from collections import OrderedDict, defaultdict, namedtuple +from pyorientdb import OrientDB +from tqdm import tqdm from ebel import tools from ebel.constants import RID -from ebel.manager.orientdb.constants import NODES, EDGES -from ..constants import bel_func_short, normalized_pmod +from ebel.manager.orientdb.constants import EDGES, NODES + +from .odb_defaults import bel_func_short, normalized_pmod -BEL_GIT_ID = namedtuple('BEL_GIT_ID', ['hexsha', 'repo_path', 'origin_url']) -JsonParts = namedtuple('JsonParts', ['document', - 'definitions', - 
'statements_and_sets']) +BEL_GIT_ID = namedtuple("BEL_GIT_ID", ["hexsha", "repo_path", "origin_url"]) +JsonParts = namedtuple("JsonParts", ["document", "definitions", "statements_and_sets"]) logger = logging.getLogger(__name__) @@ -54,7 +53,7 @@ def file_is_in_git_repo(path): def _get_bel_rid_cache(self): """Get BEL string OrientDB rid dictionary for all bel nodes.""" sql = "Select @rid.asString(),@class.asString(),bel from bel" - return {(y['bel'], y['class']): y['rid'] for y in [x.oRecordData for x in self.client.command(sql)]} + return {(y["bel"], y["class"]): y["rid"] for y in [x.oRecordData for x in self.client.command(sql)]} def _get_bel_relation_rid_cache(self): """Create a dictionary of all bel_relation edges using their properties as keys and RIDs as values.""" @@ -63,14 +62,22 @@ def _get_bel_relation_rid_cache(self): sql = """SELECT @rid.asString(), out.@rid.asString() as subject_id, in.@rid.asString() as object_id, evidence, annotation, citation.type as citation_type, citation.ref as citation_ref, @class.asString() as relation FROM bel_relation""" - props = ('relation', 'subject_id', 'object_id', 'citation_type', 'citation_ref', 'evidence', 'annotation') + props = ( + "relation", + "subject_id", + "object_id", + "citation_type", + "citation_ref", + "evidence", + "annotation", + ) results = self.client.command(sql) for entry in results: r = entry.oRecordData edge_profile = [] for prop in props: if prop in r: - if prop == 'annotation': + if prop == "annotation": edge_profile.append(json.dumps(r[prop], sort_keys=True)) else: edge_profile.append(r[prop]) @@ -78,7 +85,7 @@ def _get_bel_relation_rid_cache(self): else: edge_profile.append(None) - rel_cache[tuple(edge_profile)] = r['rid'] + rel_cache[tuple(edge_profile)] = r["rid"] return rel_cache @@ -92,7 +99,7 @@ def get_git_info(self) -> BEL_GIT_ID: """ bel_git_id = None - absolute_path = Path(self.file_path) + absolute_path = Path(self.file_path).absolute() if self.file_is_in_git_repo(absolute_path): repo 
= git.Repo(absolute_path, search_parent_directories=True) @@ -102,15 +109,18 @@ def get_git_info(self) -> BEL_GIT_ID: if origin_url_found: protocol, user, origin_url = origin_url_found.groups() - repo_path = absolute_path.as_posix()[len(repo.working_dir) + 1:] - commits = list(repo.iter_commits(paths=absolute_path)) + repo_path = absolute_path.as_posix()[len(repo.working_dir) + 1 :] + try: + commits = list(repo.iter_commits(paths=str(absolute_path))) - if commits: - commit = commits[0] - bel_git_id = BEL_GIT_ID(commit.hexsha, repo_path, origin_url) + if commits: + commit = commits[0] + bel_git_id = BEL_GIT_ID(commit.hexsha, repo_path, origin_url) + except: + print("Not able to get commit info") if not bel_git_id: - bel_git_id = BEL_GIT_ID('', '', '') + bel_git_id = BEL_GIT_ID("", "", "") return bel_git_id @@ -136,46 +146,47 @@ def insert(self) -> Tuple[bool, int]: def insert_bel_header(self, doc_info, defs) -> Tuple[bool, str]: """Insert header info (document_info, namespaces and annotations) plus git info).""" data = deepcopy(doc_info) - data['annotation'] = {} - data['namespace'] = {} + data["annotation"] = {} + data["namespace"] = {} for entry in defs: anno_or_ns = list(entry.keys())[0] - keyword = entry[anno_or_ns]['keyword'] - del entry[anno_or_ns]['keyword'] + keyword = entry[anno_or_ns]["keyword"] + del entry[anno_or_ns]["keyword"] data[anno_or_ns][keyword] = entry[anno_or_ns] - if 'authors' in data: - data['authors'] = [a.strip() for a in data['authors'].split(",")] + if "authors" in data: + data["authors"] = [a.strip() for a in data["authors"].split(",")] keyword_flag = False - if 'keywords' in data: + if "keywords" in data: keyword_flag = True - data['keywords'] = [k.strip() for k in data['keywords'].split(",")] + data["keywords"] = [k.strip() for k in data["keywords"].split(",")] - keyword_rids = self.get_keyword_rids(keywords=data['keywords']) + keyword_rids = self.get_keyword_rids(keywords=data["keywords"]) keyword_linkset = "[" + 
",".join(keyword_rids) + "]" # This feature is a LINKSET - del data['keywords'] # TODO get so rids can be in JSON and load - see line 131 + del data["keywords"] # TODO get so rids can be in JSON and load - see line 131 - now = datetime.today().strftime('%Y-%m-%d %H:%M:%S') - data['date'] = {'uploaded': now} + now = datetime.today().strftime("%Y-%m-%d %H:%M:%S") + data["date"] = {"uploaded": now} file_stat = os.stat(self.file_path) - data['file'] = { - 'path': self.file_path, - 'md5': tools.md5(self.file_path), - 'size': file_stat.st_size, - 'last_modified': file_stat.st_mtime + data["file"] = { + "path": self.file_path, + "md5": tools.md5(self.file_path), + "size": file_stat.st_size, + "last_modified": file_stat.st_mtime, } git_info_dict = self.get_git_info() - data['git_info'] = git_info_dict._asdict() + data["git_info"] = git_info_dict._asdict() net_exists_before = self.network_exists( - data['version'], + data["version"], self.file_path, - data['file']['md5'], - data['file']['last_modified'], - git_info_dict) + data["file"]["md5"], + data["file"]["last_modified"], + git_info_dict, + ) if not net_exists_before: content = json.dumps(data) @@ -239,7 +250,6 @@ def insert_statements_and_sets(self, statements_and_sets, document_id) -> int: citation_type = "" for e in tqdm(statements_and_sets, desc="Insert BEL Statements"): - dtype, data = tuple(e.items())[0] if dtype == "sets": @@ -248,12 +258,12 @@ def insert_statements_and_sets(self, statements_and_sets, document_id) -> int: if key == "citation": citation = dict(value) - citation_type = citation['type'].strip() - citation_ref = citation['ref'].strip() + citation_type = citation["type"].strip() + citation_ref = citation["ref"].strip() evidence = "" annotation = defaultdict(set) - if citation['type'].lower() == "pubmed" and re.search(r'^\d+$', citation_ref): + if citation["type"].lower() == "pubmed" and re.search(r"^\d+$", citation_ref): pmid = citation_ref else: pmid = 0 @@ -270,38 +280,68 @@ def 
insert_statements_and_sets(self, statements_and_sets, document_id) -> int: annotation.pop(anno_keyword, None) elif dtype == "statement" and len(data) >= 1: + subject_id = self.get_node_id(data[0]["subject"])[1] - subject_id = self.get_node_id(data[0]['subject'])[1] - - if len(data) > 1 and 'object' in data[2]: + if len(data) > 1 and "object" in data[2]: # TODO: nested statements are missing - object_id = self.get_node_id(data[2]['object'])[1] + object_id = self.get_node_id(data[2]["object"])[1] - relation = data[1]['relation'] + relation = data[1]["relation"] - self.insert_bel_edge(annotation, citation, citation_ref, citation_type, evidence, object_id, pmid, - relation, subject_id, document_id) + self.insert_bel_edge( + annotation, + citation, + citation_ref, + citation_type, + evidence, + object_id, + pmid, + relation, + subject_id, + document_id, + ) return len(statements_and_sets) - def insert_bel_edge(self, annotation, citation, citation_ref, citation_type, evidence, object_id, pmid, relation, - subject_id, document_id): + def insert_bel_edge( + self, + annotation, + citation, + citation_ref, + citation_type, + evidence, + object_id, + pmid, + relation, + subject_id, + document_id, + ): # check if relation already exists anno = {key: sorted(list(annotation[key])) for key in annotation.keys()} anno_json = json.dumps(anno, sort_keys=True) - edge_content = {"pmid": pmid, - "citation": citation, - "annotation": anno, - "evidence": evidence, - "document": [document_id]} + edge_content = { + "pmid": pmid, + "citation": citation, + "annotation": anno, + "evidence": evidence, + "document": [document_id], + } # Need to clean the properties - evidence = evidence.replace('\n', ' ') + evidence = evidence.replace("\n", " ") citation_ref = citation_ref if citation_ref else None citation_type = citation_type if citation_type else None - edge_profile = (relation, subject_id, object_id, citation_type, citation_ref, evidence, anno_json) + edge_profile = ( + relation, + 
subject_id, + object_id, + citation_type, + citation_ref, + evidence, + anno_json, + ) edge_exists = True if edge_profile in _BelImporter._cache[EDGES] else False if not edge_exists: @@ -319,9 +359,11 @@ def insert_bel_edge(self, annotation, citation, citation_ref, citation_type, evi def get_json_parts(cls, bel_python_object) -> JsonParts: """Return the parts document,definitions,statements_and_sets from bel_json as python object.""" doc, defs, ss = bel_python_object - return JsonParts(document=doc['document'], - definitions=defs['definitions'], - statements_and_sets=ss['statements_and_sets']) + return JsonParts( + document=doc["document"], + definitions=defs["definitions"], + statements_and_sets=ss["statements_and_sets"], + ) @staticmethod def get_node_class_rid_from_db(obj, bel: str): @@ -329,8 +371,8 @@ def get_node_class_rid_from_db(obj, bel: str): rid = None node_class = None - if isinstance(obj[0], OrderedDict) and 'function' in obj[0]: - node_class = obj[0]['function']['name'] + if isinstance(obj[0], OrderedDict) and "function" in obj[0]: + node_class = obj[0]["function"]["name"] if (bel, node_class) in _BelImporter._cache[NODES]: rid = _BelImporter._cache[NODES][(bel, node_class)] @@ -338,7 +380,7 @@ def get_node_class_rid_from_db(obj, bel: str): @staticmethod def is_function(obj): - return isinstance(obj, OrderedDict) and 'function' in obj + return isinstance(obj, OrderedDict) and "function" in obj def get_node_id(self, obj: list) -> str: """Return rid of obj.""" @@ -350,9 +392,9 @@ def get_node_id(self, obj: list) -> str: bel = self.get_bel(obj) - node_class = obj[0]['function']['name'] + node_class = obj[0]["function"]["name"] - if node_class not in ['pmod', 'fragment', 'variant']: + if node_class not in ["pmod", "fragment", "variant"]: node_class, rid_from_db = self.get_node_class_rid_from_db(obj, bel) if rid_from_db: return node_class, rid_from_db @@ -380,7 +422,7 @@ def get_node_id(self, obj: list) -> str: def insert_bel_node(self, node_class, params, 
bel): """Insert bel node, return rid.""" - params.update({'bel': bel}) + params.update({"bel": bel}) sql_temp = "insert into {} content {}" sql = sql_temp.format(node_class, json.dumps(params)) record = self.client.command(sql) @@ -396,30 +438,30 @@ def get_bel_string(self, params, function_name=None): if isinstance(param, str): bels.append(param) elif isinstance(param, dict): - if set(param.keys()) == {'namespace', 'name'}: - bels.append(param['namespace'] + ':"' + param['name'] + '"') + if set(param.keys()) == {"namespace", "name"}: + bels.append(param["namespace"] + ':"' + param["name"] + '"') elif function_name == "fragment": - bels.append(','.join(['"' + x + '"' for x in param.values() if x])) + bels.append(",".join(['"' + x + '"' for x in param.values() if x])) elif function_name == "activity": - if param['namespace']: - bel_str = param['namespace'] + ':"' + param['name'] + '"' + if param["namespace"]: + bel_str = param["namespace"] + ':"' + param["name"] + '"' else: - bel_str = param['default'] + bel_str = param["default"] bels.append("ma(" + bel_str + ")") elif function_name == "pmod": - if param['namespace']: - first_part_pmod = param['namespace'] + ':"' + param['name'] + '"' + if param["namespace"]: + first_part_pmod = param["namespace"] + ':"' + param["name"] + '"' else: - first_part_pmod = normalized_pmod[param['type']] - position = str(param['position']) if param['position'] else None - parts_pmod = [first_part_pmod, param['amino_acid'], position] + first_part_pmod = normalized_pmod[param["type"]] + position = str(param["position"]) if param["position"] else None + parts_pmod = [first_part_pmod, param["amino_acid"], position] bels.append(",".join([x for x in parts_pmod if x])) else: - bels.append(','.join(['"' + str(x) + '"' for x in param.values() if x])) + bels.append(",".join(['"' + str(x) + '"' for x in param.values() if x])) joined_params = ",".join(bels) @@ -435,11 +477,9 @@ def get_bel(self, obj, parent_function=None): function_name = None for 
element in obj: - if isinstance(element, dict): - - if 'function' in element: - function_name = element['function']['name'] + if "function" in element: + function_name = element["function"]["name"] else: params.append(element) diff --git a/ebel/manager/orientdb/odb_defaults.py b/ebel/manager/orientdb/odb_defaults.py index bfadef7..9807383 100755 --- a/ebel/manager/orientdb/odb_defaults.py +++ b/ebel/manager/orientdb/odb_defaults.py @@ -1,41 +1,155 @@ """OrientDB defaults.""" from enum import Enum +from typing import Dict class OIndexType(Enum): """Allowed OrientDB Index types.""" - NOTUNIQUE = 'notunique' - UNIQUE = 'unique' - FULLTEXT = 'fulltext' - DICTIONARY = 'dictionary' - UNIQUE_HASH_INDEX = 'unique_hash_index' - NOTUNIQUE_HASH_INDEX = 'notunique_hash_index' - FULLTEXT_HASH_INDEX = 'fulltext_hash_index' - DICTIONARY_HASH_INDEX = 'dictionary_hash_index' + NOTUNIQUE = "notunique" + UNIQUE = "unique" + FULLTEXT = "fulltext" + DICTIONARY = "dictionary" + UNIQUE_HASH_INDEX = "unique_hash_index" + NOTUNIQUE_HASH_INDEX = "notunique_hash_index" + FULLTEXT_HASH_INDEX = "fulltext_hash_index" + DICTIONARY_HASH_INDEX = "dictionary_hash_index" class ODataType(Enum): """Allowed OrientDB Data types.""" - BOOLEAN = 'BOOLEAN' - BINARY = 'BINARY' - BYTE = 'BYTE' - DATE = 'DATE' - DATETIME = 'DATETIME' - DECIMAL = 'DECIMAL' - DOUBLE = 'DOUBLE' - EMBEDDED = 'EMBEDDED' - EMBEDDEDLIST = 'EMBEDDEDLIST' - EMBEDDEDSET = 'EMBEDDEDSET' - EMBEDDEDMAP = 'EMBEDDEDMAP' - FLOAT = 'FLOAT' - INTEGER = 'INTEGER' - LONG = 'LONG' - LINK = 'LINK' - LINKLIST = 'LINKLIST' - LINKSET = 'LINKSET' - LINKMAP = 'LINKMAP' - LINKBAG = 'LINKBAG' - STRING = 'STRING' - SHORT = 'SHORT' + BOOLEAN = "BOOLEAN" + BINARY = "BINARY" + BYTE = "BYTE" + DATE = "DATE" + DATETIME = "DATETIME" + DECIMAL = "DECIMAL" + DOUBLE = "DOUBLE" + EMBEDDED = "EMBEDDED" + EMBEDDEDLIST = "EMBEDDEDLIST" + EMBEDDEDSET = "EMBEDDEDSET" + EMBEDDEDMAP = "EMBEDDEDMAP" + FLOAT = "FLOAT" + INTEGER = "INTEGER" + LONG = "LONG" + LINK = "LINK" + 
LINKLIST = "LINKLIST" + LINKSET = "LINKSET" + LINKMAP = "LINKMAP" + LINKBAG = "LINKBAG" + STRING = "STRING" + SHORT = "SHORT" + + +bel_func_short: Dict[str, str] = { + "gmod": "gmod", + "protein": "p", + "abundance": "a", + "micro_rna": "m", + "rna": "r", + "gene": "g", + "activity": "act", + "fragment": "frag", + "pmod": "pmod", + "location": "loc", + "variant": "var", + "complex": "complex", + "reaction": "rxn", + "reactants": "reactants", + "products": "products", + "pathology": "path", + "degradation": "deg", + "biological_process": "bp", + "list": "list", + "cell_secretion": "sec", + "composite": "composite", + "translocation": "tloc", + "fusion_protein": "fus", + "fusion_rna": "fus", + "fusion_gene": "fus", + "from_location": "fromLoc", + "to_location": "toLoc", + "cell_surface_expression": "surf", + "population": "pop", +} + +normalized_pmod: Dict[str, str] = { + "ace": "Ac", + "adr": "ADPRib", + "add": '"ADP-rybosylation"', + "far": "Farn", + "ger": "Gerger", + "gly": "Glyco", + "hyd": "Hy", + "isg": "ISG", + "me0": "Me", + "me1": "Me1", + "mon": "monomethylation", + "me2": "Me2", + "me3": "Me3", + "tri": "trimethylation", + "myr": "Myr", + "ned": "Nedd", + "ngl": "NGlyco", + "nit": "NO", + "ogl": "OGlyco", + "pal": "Palm", + "pho": "Ph", + "sul": "Sulf", + "sup": "sulphation", + "suh": "sulfonation", + "sum": "sulphonation", + "suy": "Sumo", + "ubi": "Ub", + "u48": "UbK48", + "u63": "UbK63", + "ubm": "UbMono", + "ubp": "UbPoly", + "pre": "Prenylation", # added for BioGrid + "dei": "de-ISGylation", # added for BioGrid + "fat": "FAT10ylation", + "ufm": "Ufmylation", # added for BioGrid +} + +normalized_pmod_reverse = {v: k for k, v in normalized_pmod.items()} + + +class BelPmod: + """Protein modification definitions.""" + + ACE = normalized_pmod["ace"] + ADR = normalized_pmod["adr"] + ADD = normalized_pmod["add"] + FAR = normalized_pmod["far"] + GER = normalized_pmod["ger"] + GLY = normalized_pmod["gly"] + HYD = normalized_pmod["hyd"] + ISG = 
normalized_pmod["isg"] + ME0 = normalized_pmod["me0"] + ME1 = normalized_pmod["me1"] + MON = normalized_pmod["mon"] + ME2 = normalized_pmod["me2"] + ME3 = normalized_pmod["me3"] + TRI = normalized_pmod["tri"] + MYR = normalized_pmod["myr"] + NED = normalized_pmod["ned"] + NGL = normalized_pmod["ngl"] + NIT = normalized_pmod["nit"] + OGL = normalized_pmod["ogl"] + PAL = normalized_pmod["pal"] + PHO = normalized_pmod["pho"] + SUL = normalized_pmod["sul"] + SUP = normalized_pmod["sup"] + SUH = normalized_pmod["suh"] + SUM = normalized_pmod["sum"] + SUY = normalized_pmod["suy"] + UBI = normalized_pmod["ubi"] + U48 = normalized_pmod["u48"] + U63 = normalized_pmod["u63"] + UBM = normalized_pmod["ubm"] + UBP = normalized_pmod["ubp"] + PRE = normalized_pmod["pre"] # added for BioGrid + DEI = normalized_pmod["dei"] # added for BioGrid + FAT = normalized_pmod["fat"] + UFM = normalized_pmod["ufm"] # added for BioGrid diff --git a/ebel/manager/orientdb/odb_meta.py b/ebel/manager/orientdb/odb_meta.py index aa39e48..cb9b95a 100644 --- a/ebel/manager/orientdb/odb_meta.py +++ b/ebel/manager/orientdb/odb_meta.py @@ -7,36 +7,36 @@ import random import socket import time - from abc import abstractmethod -from collections import defaultdict, OrderedDict +from collections import OrderedDict, defaultdict from http.client import RemoteDisconnected from shutil import copyfileobj from types import GeneratorType -from typing import List, Iterable, Dict, Union, Tuple, Set, Optional -from urllib.request import urlopen, Request +from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from urllib.request import Request, urlopen import numpy as np import pandas as pd import requests -import xmltodict import sqlalchemy as sqla -from sqlalchemy.sql.schema import Table -from sqlalchemy_utils import database_exists, create_database - +import xmltodict from pyorientdb import OrientDB, orient -from pyorientdb.exceptions import PyOrientIndexException, PyOrientCommandException, 
PyOrientSecurityAccessException +from pyorientdb.exceptions import (PyOrientCommandException, + PyOrientIndexException, + PyOrientSecurityAccessException) from pyorientdb.otypes import OrientRecord - +from sqlalchemy.sql.schema import Table +from sqlalchemy_utils import create_database, database_exists from tqdm import tqdm import ebel.database -from ebel.constants import RID, DEFAULT_ODB from ebel.cache import set_mysql_interactive +from ebel.config import get_config_as_dict, get_config_value, write_to_config +from ebel.constants import DEFAULT_ODB, RID from ebel.manager.orientdb import urls as default_urls -from ebel.manager.orientdb.odb_structure import OClass, OIndex, OProperty, Edge, Generic, Node -from ebel.tools import BelRdb, get_file_path, chunks, get_standard_name -from ebel.config import write_to_config, get_config_as_dict +from ebel.manager.orientdb.odb_structure import (Edge, Generic, Node, OClass, + OIndex, OProperty) +from ebel.tools import BelRdb, chunks, get_file_path, get_standard_name type_map_inverse = {v: k for k, v in orient.type_map.items()} @@ -55,16 +55,16 @@ class Graph(abc.ABC): """Generic parent class for BioDBs.""" def __init__( - self, - generics: Tuple[Generic] = (), - nodes: Tuple[Node] = (), - edges: Tuple[Edge] = (), - indices: Tuple[OIndex] = (), - urls: dict = None, - biodb_name: str = '', - tables_base=None, - config_params: Optional[dict] = None, - overwrite_config: bool = False, + self, + generics: Tuple[Generic] = (), + nodes: Tuple[Node] = (), + edges: Tuple[Edge] = (), + indices: Tuple[OIndex] = (), + urls: dict = None, + biodb_name: str = "", + tables_base=None, + config_params: Optional[dict] = None, + overwrite_config: bool = False, ): """Init method.""" self.generic_classes: Tuple[OClass] = generics @@ -76,11 +76,11 @@ def __init__( self.urls = urls if urls else {} self.biodb_name = biodb_name - self.odb_db_name = config_params['db'] if config_params and 'db' in config_params else None - self.odb_user = 
config_params['user'] if config_params and 'user' in config_params else None - self.odb_password = config_params['password'] if config_params and 'password' in config_params else None - self.odb_server = config_params['server'] if config_params and 'server' in config_params else 'localhost' - self.odb_port = config_params['port'] if config_params and 'port' in config_params else '2424' + self.odb_db_name = config_params["db"] if config_params and "db" in config_params else None + self.odb_user = config_params["user"] if config_params and "user" in config_params else None + self.odb_password = config_params["password"] if config_params and "password" in config_params else None + self.odb_server = config_params["server"] if config_params and "server" in config_params else "localhost" + self.odb_port = config_params["port"] if config_params and "port" in config_params else "2424" self.odb_user_reader = None self.odb_user_reader_password = None # Root password should not be set, but can be @@ -96,7 +96,7 @@ def __init__( self.engine = rdb.engine self.session = rdb.session - if not database_exists(self.engine.url): + if not (get_config_value("DATABASE", "sqlalchemy_connection_string") or database_exists(self.engine.url)): if str(self.engine.url).startswith("mysql"): set_mysql_interactive() @@ -114,7 +114,7 @@ def __config_params_check(self, overwrite_config: bool = False): "password": self.odb_password, "server": self.odb_server, "port": int(self.odb_port), - "root_password": self._odb_root_password + "root_password": self._odb_root_password, } # If values are passed @@ -134,8 +134,8 @@ def __config_params_check(self, overwrite_config: bool = False): # No parameters passed and no saved parameters in config means user needs to provide some info else: missing_params = ", ".join({key for key, val in credentials.items() if val is None}) + logger.info(f"Initial configuration parameters missing. 
Missing parameters: {missing_params}") raise ValueError(f"Please provide initial configuration parameters. Missing parameters: {missing_params}") - # logger.error(f"Please provide initial configuration parameters. Missing parameters: {missing_params}") def execute(self, command_str: str) -> List[OrientRecord]: """Execute a command directly in the OrientDB server. @@ -171,22 +171,29 @@ def set_configuration_parameters(self): """Set configuration for OrientDB database client instance using configuration file or passed params.""" odb_config = get_config_as_dict().get(DEFAULT_ODB) - self.odb_db_name = self.odb_db_name or odb_config.get('name') - self.odb_user = self.odb_user or odb_config.get('user') - self.odb_password = self.odb_password or odb_config.get('password') - self.odb_server = self.odb_server or odb_config.get('server') - self.odb_port = int(self.odb_port or odb_config.get('port') or '2424') - self.odb_user_reader = self.odb_user_reader or odb_config.get('user_reader') or None - self.odb_user_reader_password = self.odb_user_reader_password or odb_config.get('user_reader_password') or None + self.odb_db_name = self.odb_db_name or odb_config.get("name") + self.odb_user = self.odb_user or odb_config.get("user") + self.odb_password = self.odb_password or odb_config.get("password") + self.odb_server = self.odb_server or odb_config.get("server") + self.odb_port = int(self.odb_port or odb_config.get("port") or "2424") + self.odb_user_reader = self.odb_user_reader or odb_config.get("user_reader") or None + self.odb_user_reader_password = self.odb_user_reader_password or odb_config.get("user_reader_password") or None # Root password should not be written in the config file, but it's possible - self._odb_root_password = self._odb_root_password or odb_config.get('root_password') + self._odb_root_password = self._odb_root_password or odb_config.get("root_password") def get_client(self) -> OrientDB: """Attempts to connect to the OrientDB client. 
This is currently done by using session tokens.""" - client = ebel.database.get_orientdb_client(self.odb_server, self.odb_port, self.odb_db_name, self.odb_user, - self.odb_password, self._odb_root_password, self.odb_user_reader, - self.odb_user_reader_password) + client = ebel.database.get_orientdb_client( + self.odb_server, + self.odb_port, + self.odb_db_name, + self.odb_user, + self.odb_password, + self._odb_root_password, + self.odb_user_reader, + self.odb_user_reader_password, + ) return client @@ -198,7 +205,7 @@ def __repr__(self): url=self.urls, edges={k: v for k, v in self.number_of_edges.items() if v}, nodes={k: v for k, v in self.number_of_nodes.items() if v}, - generics={k: v for k, v in self.number_of_generics.items() if v} + generics={k: v for k, v in self.number_of_generics.items() if v}, ) return representation @@ -236,7 +243,7 @@ def create_index_rdbms(self, table_name: str, columns): """Creates index on column(s) in RDBMS.""" if isinstance(columns, str): columns = [columns] - sql_columns = ','.join(columns) + sql_columns = ",".join(columns) index_name = f"idx_{table_name}_" + "_".join(columns) self.engine.execute(f"CREATE INDEX {index_name} ON {table_name} ({sql_columns})") @@ -244,17 +251,19 @@ def clear_edges_by_bel_doc_rid(self, bel_document_rid: str, even_if_other_doc_ri """Delete all edges linked to a specified BEL document rID.""" changes = 0 if even_if_other_doc_rids_exists: - sql = f'DELETE FROM `bel_relation` WHERE {bel_document_rid} IN document' + sql = f"DELETE FROM `bel_relation` WHERE {bel_document_rid} IN document" changes = self.execute(sql)[0] else: - sql = f'SELECT @rid.asString() AS rid, document.asString() FROM `bel_relation` ' \ - f'WHERE {bel_document_rid} IN document' + sql = ( + f"SELECT @rid.asString() AS rid, document.asString() FROM `bel_relation` " + f"WHERE {bel_document_rid} IN document" + ) for r in self.query_get_dict(sql): - new_rids = [x.strip() for x in r['document'][1:-1].split(',')] + new_rids = [x.strip() for x 
in r["document"][1:-1].split(",")] new_rids.remove(bel_document_rid) if new_rids: - new_rids_str = ','.join(new_rids) + new_rids_str = ",".join(new_rids) self.execute(f"UPDATE {r['rid']} SET document = [{new_rids_str}]") changes += 1 else: @@ -267,20 +276,20 @@ def clear_edges_by_bel_doc_rid(self, bel_document_rid: str, even_if_other_doc_ri def clear_documents(self) -> int: """Clear all document info. Returns number of deleted documents.""" - return self.execute('Delete from `bel_document`')[0] + return self.execute("Delete from `bel_document`")[0] def get_number_of_bel_statements_by_document_rid(self, bel_document_rid: str) -> int: """Return BEL statement count with a given document ID.""" sql = f"Select count(*) as num from bel_relation where document = {bel_document_rid}" - return self.execute(sql)[0].oRecordData['num'] + return self.execute(sql)[0].oRecordData["num"] def get_documents(self): """Return all document info as pandas DataFrame.""" - return self.query('Select * from bel_document') + return self.query("Select * from bel_document") def get_documents_as_dict(self): """Return all document info as pandas DataFrame.""" - return [x.oRecordData for x in self.execute('Select * from bel_document')] + return [x.oRecordData for x in self.execute("Select * from bel_document")] def add_keyword(self, keyword: str, description: str) -> pd.DataFrame: """Add a keyword and description used to tagging BEL documents. 
@@ -307,7 +316,7 @@ def get_info_properties(self, class_name: str, short: bool = True): o_record_datas = [x.oRecordData for x in self.execute(sql)] if short: - props = [{'name': x['name'], 'type': type_map_inverse[x['type']]} for x in o_record_datas] + props = [{"name": x["name"], "type": type_map_inverse[x["type"]]} for x in o_record_datas] else: props = o_record_datas @@ -339,10 +348,21 @@ def query_get_dict(self, sql: str) -> List[dict]: results = self.execute(sql) return [x.oRecordData for x in results] - def query_class(self, class_name: str, limit: int = 0, skip: int = 0, columns: Iterable[str] = None, - with_rid=True, with_class=False, print_sql: bool = False, group_by: List[str] = None, - distinct=False, as_dataframe: bool = False, where_list: Tuple[str] = (), - **params) -> Union[List[dict], pd.DataFrame]: + def query_class( + self, + class_name: str, + limit: int = 0, + skip: int = 0, + columns: Iterable[str] = None, + with_rid=True, + with_class=False, + print_sql: bool = False, + group_by: List[str] = None, + distinct=False, + as_dataframe: bool = False, + where_list: Tuple[str] = (), + **params, + ) -> Union[List[dict], pd.DataFrame]: """Query class by params and returns list of pyorient.OrientRecord.""" if not self.class_exists(class_name): raise ExceptionClassNotExists("Class {} not exists in database.".format(class_name)) @@ -375,19 +395,21 @@ def query_class(self, class_name: str, limit: int = 0, skip: int = 0, columns: I group_by = "GROUP BY " + ", ".join(group_by) else: - group_by = '' + group_by = "" if distinct and len(cols) == 1: sql_cols = "distinct({})".format(sql_cols) sql_temp = "SELECT {sql_cols} FROM `{class_name}` {where} {group_by} {sql_limit} {sql_skip}" - sql = sql_temp.format(sql_cols=sql_cols, - class_name=class_name, - where=where, - sql_limit=sql_limit, - sql_skip=sql_skip, - group_by=group_by) + sql = sql_temp.format( + sql_cols=sql_cols, + class_name=class_name, + where=where, + sql_limit=sql_limit, + sql_skip=sql_skip, + 
group_by=group_by, + ) if print_sql: print(sql) @@ -402,41 +424,53 @@ def query_class(self, class_name: str, limit: int = 0, skip: int = 0, columns: I return return_value - def query_class_chunks(self, class_name: str, chunk_size: int = 10000, columns: Iterable[str] = None, - with_rid=True, with_class=False, print_sql: bool = False, group_by: List[str] = None, - distinct: bool = False): + def query_class_chunks( + self, + class_name: str, + chunk_size: int = 10000, + columns: Iterable[str] = None, + with_rid=True, + with_class=False, + print_sql: bool = False, + group_by: List[str] = None, + distinct: bool = False, + ): """Query class by params and only return a set of results in batches. Creates a generator.""" number_entries = self.get_number_of_class(class_name) total_num_chunks = (number_entries // chunk_size) + 1 chunk_index = 0 while chunk_index < total_num_chunks: - generic_table_chunk = self.query_class(class_name=class_name, - limit=chunk_size, - skip=chunk_size * chunk_index, - with_rid=with_rid, - with_class=with_class, - print_sql=print_sql, - group_by=group_by, - distinct=distinct, - columns=columns) + generic_table_chunk = self.query_class( + class_name=class_name, + limit=chunk_size, + skip=chunk_size * chunk_index, + with_rid=with_rid, + with_class=with_class, + print_sql=print_sql, + group_by=group_by, + distinct=distinct, + columns=columns, + ) yield generic_table_chunk chunk_index += 1 def query_rid(self, rid, columns: list = None): """Query specified columns of a given rID entry.""" table_columns = columns if columns else [] - columns_in_sql = ', '.join(table_columns) + columns_in_sql = ", ".join(table_columns) results = self.execute(f"Select {columns_in_sql} from {rid}") if results: return [x.oRecordData for x in results][0] - def download(self, - url_dict: Dict[str, str] = None, - biodb: str = None, - expiration_days: int = 100) -> Dict[str, bool]: + def download( + self, + url_dict: Dict[str, str] = None, + biodb: str = None, + 
expiration_days: int = 100, + ) -> Dict[str, bool]: """Download url to file_path if not older than expiration_days.""" - if not url_dict and hasattr(self, 'urls'): + if not url_dict and hasattr(self, "urls"): url_dict = self.urls downloaded = {} @@ -451,8 +485,9 @@ def download_file(url: str, biodb: str, expiration_days: int = 100, addtional_he """Download file. Returns True if it was needed to download the file.""" header = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/" - "537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36", - "X-Requested-With": "XMLHttpRequest"} + "537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36", + "X-Requested-With": "XMLHttpRequest", + } if addtional_header: header.update(addtional_header) @@ -470,13 +505,16 @@ def download_file(url: str, biodb: str, expiration_days: int = 100, addtional_he if not file_exists or expired: logger.info(f"Download {url}") try: - with urlopen(Request(url, headers=header)) as response, open(file_path, 'wb') as out_file: + with urlopen(Request(url, headers=header)) as response, open(file_path, "wb") as out_file: copyfileobj(response, out_file) downloaded = True except RemoteDisconnected: - logger.error(f"{url} could not be downloaded! Website appears to be down.", exc_info=False) + logger.error( + f"{url} could not be downloaded! 
Website appears to be down.", + exc_info=False, + ) except Exception: logger.error(f"{url} could not be downloaded!", exc_info=True) @@ -486,7 +524,9 @@ def download_file(url: str, biodb: str, expiration_days: int = 100, addtional_he def index_exists(self, index_name: str): """Check if index_name exists.""" sql = """Select 1 as exists from (select expand(indexes) - from metadata:indexmanager) where name = '{}'""".format(index_name) + from metadata:indexmanager) where name = '{}'""".format( + index_name + ) return len(self.execute(sql)) > 0 def create_index(self, index: OIndex): @@ -495,10 +535,12 @@ def create_index(self, index: OIndex): index_name = self.get_index_name(index) if not self.index_exists(index_name): - sql = sql.format(class_name=index.class_name, - columns=",".join(index.columns), - index_name=self.get_index_name(index), - index_type=index.index_type.value) + sql = sql.format( + class_name=index.class_name, + columns=",".join(index.columns), + index_name=self.get_index_name(index), + index_type=index.index_type.value, + ) try: self.execute(sql) @@ -588,12 +630,12 @@ def create_class(self, oclass: OClass, print_sql=False): """ sql_class = "CREATE CLASS {name} IF NOT EXISTS {extends} {abstract}" - sql_dict = {"name": "`" + oclass.name + "`", "extends": '', "abstract": ''} + sql_dict = {"name": "`" + oclass.name + "`", "extends": "", "abstract": ""} if oclass.extends: - sql_dict['extends'] = "EXTENDS " + ", ".join(["`" + x + "`" for x in oclass.extends]) + sql_dict["extends"] = "EXTENDS " + ", ".join(["`" + x + "`" for x in oclass.extends]) if oclass.abstract: - sql_dict['abstract'] = "ABSTRACT" + sql_dict["abstract"] = "ABSTRACT" sql = sql_class.format(**sql_dict) if print_sql: @@ -607,24 +649,26 @@ def create_class(self, oclass: OClass, print_sql=False): if isinstance(oclass, Edge): sql_in_out = f"CREATE PROPERTY {oclass.name}.{{}} IF NOT EXISTS LINK {{}}" if oclass.in_out[0]: - self.execute(sql_in_out.format('in', oclass.in_out[0])) + 
self.execute(sql_in_out.format("in", oclass.in_out[0])) if oclass.in_out[1]: - self.execute(sql_in_out.format('out', oclass.in_out[1])) + self.execute(sql_in_out.format("out", oclass.in_out[1])) def create_class_property(self, class_name: str, prop: OProperty, print_sql: bool = False): """Create OrientDB class property.""" sql_prop = "CREATE PROPERTY `{name}`.`{prop_name}`\ IF NOT EXISTS {dtype} {linked_class} {linked_type} {mandatory}" - linked_class = prop.linked_class or '' - linked_type = prop.linked_type.value if prop.linked_type else '' - mandatory = "(MANDATORY TRUE)" if prop.mandatory else '' - sql = sql_prop.format(name=class_name, - prop_name=prop.prop_name, - dtype=prop.data_type.value, - linked_class=linked_class, - linked_type=linked_type, - mandatory=mandatory) + linked_class = prop.linked_class or "" + linked_type = prop.linked_type.value if prop.linked_type else "" + mandatory = "(MANDATORY TRUE)" if prop.mandatory else "" + sql = sql_prop.format( + name=class_name, + prop_name=prop.prop_name, + dtype=prop.data_type.value, + linked_class=linked_class, + linked_type=linked_type, + mandatory=mandatory, + ) if print_sql: print(sql) self.execute(sql) @@ -673,9 +717,11 @@ def drop_edge_classes(self): def clear(self): """Clear (delete entries) from all classes.""" - return {'nodes': self.clear_nodes(), - 'edges': self.clear_edges(), - 'generics': self.clear_generics()} + return { + "nodes": self.clear_nodes(), + "edges": self.clear_edges(), + "generics": self.clear_generics(), + } def is_abstract_class(self, class_name: str) -> bool: """Returns true if class is abstract.""" @@ -688,7 +734,7 @@ def clear_edges(self) -> Dict[str, int]: """Delete all edges.""" deleted = {} sql_temp = "DELETE EDGE `{}`" - if hasattr(self, 'edge_classes'): + if hasattr(self, "edge_classes"): for oclass in self.edge_classes: if oclass.own_class and not oclass.abstract and self.class_exists(oclass.name): sql = sql_temp.format(oclass.name) @@ -739,17 +785,17 @@ def 
clear_nodes_and_edges(self): """Delete all nodes and edges of a specific biodb.""" number_of_deleted_edges = self.clear_edges() number_of_deleted_nodes = self.clear_nodes() - return {'nodes': number_of_deleted_nodes, 'edges': number_of_deleted_edges} + return {"nodes": number_of_deleted_nodes, "edges": number_of_deleted_edges} def clear_all_nodes_and_edges(self): """Delete all nodes and edges in the whole database.""" edges_deleted = self.execute("Delete edge E")[0] nodes_deleted = self.execute("Delete vertex V")[0] - return {'edges': edges_deleted, 'nodes': nodes_deleted} + return {"edges": edges_deleted, "nodes": nodes_deleted} def clear_exp_edges(self): """Delete all DEA experiment associated edges.""" - deleted = self.clear_class(class_name='ko_relation') + deleted = self.clear_class(class_name="ko_relation") return deleted def recreate(self): @@ -788,7 +834,7 @@ def __get_sql_where_part(params, where_list: Tuple[str] = ()): where_list = list(where_list) for column, value in params.items(): if isinstance(value, (str, list, dict)): - if value == 'notnull': + if value == "notnull": where_list.append("`{}` IS NOT NULL".format(column)) else: where_list.append("`{}` = {}".format(column, json.dumps(value))) @@ -803,12 +849,12 @@ def __get_sql_where_part(params, where_list: Tuple[str] = ()): def get_number_of_class(self, class_name, distinct_column_name: str = None, **params): """Return count of unique values for a given class_name and column name.""" - column = '*' + column = "*" if distinct_column_name: - column = f'distinct({distinct_column_name})' + column = f"distinct({distinct_column_name})" where = self.__get_sql_where_part(params) sql = f"Select count(`{column}`) from `{class_name}`{where}" - return self.execute(sql)[0].oRecordData['count'] + return self.execute(sql)[0].oRecordData["count"] @property def number_of_nodes(self): @@ -833,7 +879,7 @@ def get_cluster_ids(self, class_name: str) -> list: """Get all cluster ids by class name.""" sql_cids = "Select 
clusterIds from (select expand(classes) from metadata:schema) where name = '{}'" if class_name not in self.cluster_ids: - cids = self.execute(sql_cids.format(class_name))[0].oRecordData['clusterIds'] + cids = self.execute(sql_cids.format(class_name))[0].oRecordData["clusterIds"] self.cluster_ids[class_name] = cids return self.cluster_ids[class_name] @@ -856,7 +902,7 @@ def create_record(self, class_name: str, value_dict: dict) -> Optional[str]: cid = random.choice(self.get_cluster_ids(class_name)) try: - r = self.client.record_create(cid, {'@' + class_name: value_dict})._OrientRecord__rid + r = self.client.record_create(cid, {"@" + class_name: value_dict})._OrientRecord__rid except (PyOrientCommandException, socket.timeout): logger.warning("Standard insert in odb_meta.create_record did not work!") @@ -871,19 +917,24 @@ def create_record(self, class_name: str, value_dict: dict) -> Optional[str]: def update_record(self, class_name: str, value_dict: dict) -> str: """Update record with content of value_dict.""" cid = random.choice(self.get_cluster_ids(class_name)) - r = self.client.record_update(cid, {'@' + class_name: value_dict}) + r = self.client.record_update(cid, {"@" + class_name: value_dict}) return r._OrientRecord__rid def edge_exists(self, class_name: str, from_rid: str, to_rid: str, value_dict: dict = {}) -> str: """Check if edge exists. Return rid if exists else None.""" data = copy.deepcopy(value_dict) # deep copy and DO NOT change the dictionary!!! 
- data.update({'out.@rid': from_rid, 'in.@rid': to_rid}) + data.update({"out.@rid": from_rid, "in.@rid": to_rid}) result = self.query_class(class_name, limit=1, **data) if result: return result[0][RID] - def node_exists(self, class_name: str, value_dict: dict = {}, check_for: Union[Iterable[str], str] = None, - print_sql: bool = False) -> str: + def node_exists( + self, + class_name: str, + value_dict: dict = {}, + check_for: Union[Iterable[str], str] = None, + print_sql: bool = False, + ) -> str: """Check if node exists. Return rid if exists else None.""" check_for_dict = value_dict.copy() if check_for: @@ -893,10 +944,18 @@ def node_exists(self, class_name: str, value_dict: dict = {}, check_for: Union[I if result: return result[0][RID] - def create_edge(self, class_name: str, from_rid: str, to_rid: str, value_dict: dict = {}, print_sql=False, - if_not_exists=False, ignore_empty_values=False) -> str: + def create_edge( + self, + class_name: str, + from_rid: str, + to_rid: str, + value_dict: dict = {}, + print_sql=False, + if_not_exists=False, + ignore_empty_values=False, + ) -> str: """Create edge from from_rid(@rid) to to_rid(@rid) with content of value_dict.""" - content = '' + content = "" if if_not_exists: edge_rid = self.edge_exists(class_name, from_rid, to_rid, value_dict) if edge_rid: @@ -915,7 +974,12 @@ def create_edge(self, class_name: str, from_rid: str, to_rid: str, value_dict: d def get_create_rid(self, class_name: str, value_dict: dict, check_for=None, print_sql=False) -> str: """Return class_name.@rid by value_dict. 
Create record/insert if not exists.""" - rid = self.node_exists(class_name=class_name, value_dict=value_dict, check_for=check_for, print_sql=print_sql) + rid = self.node_exists( + class_name=class_name, + value_dict=value_dict, + check_for=check_for, + print_sql=print_sql, + ) if not rid: rid = self.insert_record(class_name=class_name, value_dict=value_dict, print_sql=print_sql) return rid @@ -926,13 +990,18 @@ def update_correlative_edges(self) -> List[str]: correlative_edges = self.query_class(class_name="correlative", with_rid=False, with_class=True) for c_edge in tqdm(correlative_edges, desc="Creating reverse correlative edges"): - from_rid = c_edge.pop('in').get_hash() - to_rid = c_edge.pop('out').get_hash() + from_rid = c_edge.pop("in").get_hash() + to_rid = c_edge.pop("out").get_hash() edge_class = c_edge.pop("class") - c_edge['document'] = [doc.get_hash() for doc in c_edge['document']] - - new_rid = self.create_edge(class_name=edge_class, from_rid=from_rid, to_rid=to_rid, - value_dict=c_edge, if_not_exists=True) + c_edge["document"] = [doc.get_hash() for doc in c_edge["document"]] + + new_rid = self.create_edge( + class_name=edge_class, + from_rid=from_rid, + to_rid=to_rid, + value_dict=c_edge, + if_not_exists=True, + ) updated_edges.append(new_rid) return updated_edges @@ -949,25 +1018,25 @@ def update_pmcids(self) -> int: results = self.query(missing_pmc_sql) updated = 0 - if results is not None and 'pmid' in results: - pmids_with_missing_pmc = list(results['pmid']) + if results is not None and "pmid" in results: + pmids_with_missing_pmc = list(results["pmid"]) id_dict = dict() for pmid_sublist in chunks(pmids_with_missing_pmc, size=200): - pmid_string = ','.join([str(x) for x in pmid_sublist]) + pmid_string = ",".join([str(x) for x in pmid_sublist]) url_filled = default_urls.NCBI_PMID.format(pmid_string) api_query_response = requests.get(url_filled) pmcids_json = json.loads(api_query_response.text) - if 'records' in pmcids_json.keys(): - for record in 
pmcids_json['records']: - if 'pmid' in record.keys(): - if 'pmcid' in record.keys(): - id_dict[record['pmid']] = record['pmcid'] + if "records" in pmcids_json.keys(): + for record in pmcids_json["records"]: + if "pmid" in record.keys(): + if "pmcid" in record.keys(): + id_dict[record["pmid"]] = record["pmcid"] else: - id_dict[record['pmid']] = None + id_dict[record["pmid"]] = None for pmid, pmc in tqdm(id_dict.items(), desc="Updating PMC IDs"): if pmc: @@ -977,7 +1046,7 @@ def update_pmcids(self) -> int: return updated - def update_pmids(self, edge_name='bel_relation'): + def update_pmids(self, edge_name="bel_relation"): """Update PMID metadata for all edges of the specified edge_name.""" sql_missing_citations = """Select distinct(pmid) as pmid from {} @@ -986,7 +1055,7 @@ def update_pmids(self, edge_name='bel_relation'): pmid > 0 and citation.pubdate IS NULL""" r = self.execute(sql_missing_citations.format(edge_name)) - pmids = [x.oRecordData['pmid'] for x in r] + pmids = [x.oRecordData["pmid"] for x in r] updated = 0 @@ -997,9 +1066,9 @@ def update_pmids(self, edge_name='bel_relation'): total = len(pmids) // chunk_size + 1 for pmid_chunk in tqdm( - chunks(pmids, size=chunk_size), - total=total, - desc=f"Update PMID citations in {edge_name}" + chunks(pmids, size=chunk_size), + total=total, + desc=f"Update PMID citations in {edge_name}", ): try: updated += self._query_ncbi(pmid_chunk, edge_name) @@ -1017,7 +1086,7 @@ def _query_ncbi(self, pmid_chunk: GeneratorType, edge_name: str): sql_update_mesh_terms = "Update {} set annotation.mesh = {} where pmid = {}" sql_update_mesh_substances = "Update {} set annotation.substances = {} where pmid = {}" - nameset = {'LastName', 'Initials'} + nameset = {"LastName", "Initials"} start_time = time.time() url = default_urls.NCBI_MESH + ",".join([str(x) for x in pmid_chunk]) @@ -1025,86 +1094,88 @@ def _query_ncbi(self, pmid_chunk: GeneratorType, edge_name: str): parsed_data = xmltodict.parse(xml_pubmed.text) updated = 0 - if 
'PubmedArticleSet' in parsed_data: - medline_citations = parsed_data['PubmedArticleSet']['PubmedArticle'] + if "PubmedArticleSet" in parsed_data: + medline_citations = parsed_data["PubmedArticleSet"]["PubmedArticle"] if isinstance(medline_citations, OrderedDict): - medline_citations = [medline_citations, ] + medline_citations = [ + medline_citations, + ] for medlineCitation in medline_citations: - mc = medlineCitation['MedlineCitation'] - data = {'type': "PubMed", 'ref': mc['PMID']['#text']} - article = mc['Article'] + mc = medlineCitation["MedlineCitation"] + data = {"type": "PubMed", "ref": mc["PMID"]["#text"]} + article = mc["Article"] # Authors author_list = [] - if 'AuthorList' in article: # Authors not always listed - authors = article['AuthorList']['Author'] + if "AuthorList" in article: # Authors not always listed + authors = article["AuthorList"]["Author"] if isinstance(authors, OrderedDict): authors = [authors] for author in authors: if isinstance(author, OrderedDict) and nameset.issubset(author.keys()): - author_list.append(author['LastName'] + " " + author['Initials']) + author_list.append(author["LastName"] + " " + author["Initials"]) - data['author_list'] = author_list + data["author_list"] = author_list if author_list: - data['last_author'] = author_list[-1] + data["last_author"] = author_list[-1] # Journal - data['full_journal_name'] = mc['Article']['Journal']['Title'] + data["full_journal_name"] = mc["Article"]["Journal"]["Title"] # Article Date - article_date = mc.get('DateCompleted', mc.get('DateRevised')) + article_date = mc.get("DateCompleted", mc.get("DateRevised")) if article_date: ad = article_date.values() - data['pub_date'] = '-'.join(ad) - data['pub_year'] = int(list(ad)[0]) + data["pub_date"] = "-".join(ad) + data["pub_year"] = int(list(ad)[0]) # Title - data['title'] = mc['Article']['ArticleTitle'] + data["title"] = mc["Article"]["ArticleTitle"] # DOI - if 'ELocationID' in mc['Article']: - eids = mc['Article']['ELocationID'] + if 
"ELocationID" in mc["Article"]: + eids = mc["Article"]["ELocationID"] if isinstance(eids, OrderedDict): - eids = [mc['Article']['ELocationID']] + eids = [mc["Article"]["ELocationID"]] for eid in eids: - if eid.get('@EIdType') == 'doi': - data['doi'] = eid['#text'] + if eid.get("@EIdType") == "doi": + data["doi"] = eid["#text"] # MeSH Headings meshs = [] - if 'MeshHeadingList' in mc: - for mesh in mc['MeshHeadingList']['MeshHeading']: - m = mesh['DescriptorName'] - meshs.append(m['#text']) + if "MeshHeadingList" in mc: + for mesh in mc["MeshHeadingList"]["MeshHeading"]: + m = mesh["DescriptorName"] + meshs.append(m["#text"]) # Associated chemicals substances = [] - if 'ChemicalList' in mc: - chemicals = mc['ChemicalList']['Chemical'] + if "ChemicalList" in mc: + chemicals = mc["ChemicalList"]["Chemical"] if isinstance(chemicals, OrderedDict): chemicals = [chemicals] for chemical in chemicals: - substances.append(chemical['NameOfSubstance']['#text']) + substances.append(chemical["NameOfSubstance"]["#text"]) data_json = json.dumps(data) - sql = sql_template.format(edge_name, data_json, data['ref']) + sql = sql_template.format(edge_name, data_json, data["ref"]) self.execute(sql) if meshs: content_mesh = json.dumps(meshs) - sql_m = sql_update_mesh_terms.format(edge_name, content_mesh, data['ref']) + sql_m = sql_update_mesh_terms.format(edge_name, content_mesh, data["ref"]) self.execute(sql_m) if substances: content_substances = json.dumps(substances) - sql_s = sql_update_mesh_substances.format(edge_name, content_substances, data['ref']) + sql_s = sql_update_mesh_substances.format(edge_name, content_substances, data["ref"]) self.execute(sql_s) updated += 1 @@ -1125,23 +1196,32 @@ def _standardize_column_names(columns: Iterable[str]) -> List[str]: """ return [get_standard_name(x) for x in columns] - def _standardize_dataframe(self, dataframe: pd.DataFrame, - replace_nulls_with_nones: bool = True, - standardize_column_names: bool = True, - replace_minus_with_nones: bool = 
True) -> pd.DataFrame: + def _standardize_dataframe( + self, + dataframe: pd.DataFrame, + replace_nulls_with_nones: bool = True, + standardize_column_names: bool = True, + replace_minus_with_nones: bool = True, + ) -> pd.DataFrame: if standardize_column_names: dataframe.columns = self._standardize_column_names(dataframe.columns) if replace_nulls_with_nones: dataframe.replace({np.nan: None}, inplace=True) - if replace_minus_with_nones and any(dataframe.dtypes == 'object'): - dataframe.replace({'-': None}, inplace=True) + if replace_minus_with_nones and any(dataframe.dtypes == "object"): + dataframe.replace({"-": None}, inplace=True) return dataframe - def import_dataframe(self, dataframe: pd.DataFrame, class_name: str, replace_nulls_with_nones: bool = True, - standardize_column_names: bool = True, replace: bool = True, ) -> int: + def import_dataframe( + self, + dataframe: pd.DataFrame, + class_name: str, + replace_nulls_with_nones: bool = True, + standardize_column_names: bool = True, + replace: bool = True, + ) -> int: """Import dataframe into OrientDb class with name.""" sql_temp = f"insert into `{class_name}` content {{}}" inserted = 0 @@ -1151,22 +1231,32 @@ def import_dataframe(self, dataframe: pd.DataFrame, class_name: str, replace_nul dataframe = self._standardize_dataframe(dataframe, replace_nulls_with_nones, standardize_column_names) - for row in tqdm(dataframe.to_dict(orient='records'), desc=f"Insert {class_name.upper()} data"): + for row in tqdm( + dataframe.to_dict(orient="records"), + desc=f"Insert {class_name.upper()} data", + ): sql = sql_temp.format(json.dumps(row)) try: self.execute(sql) except PyOrientCommandException as pyorientdb_command_exception: - logging.error('OrientDB SQL error:', sql, pyorientdb_command_exception) + logging.error("OrientDB SQL error:", sql, pyorientdb_command_exception) inserted += 1 return inserted - def batch_insert(self, dataframe: pd.DataFrame, database: str, chunk_size: int = 100, desc: str = None, - 
standardize_column_names: bool = False, replace: bool = True, - replace_nulls_with_nones: bool = False) -> int: + def batch_insert( + self, + dataframe: pd.DataFrame, + database: str, + chunk_size: int = 100, + desc: str = None, + standardize_column_names: bool = False, + replace: bool = True, + replace_nulls_with_nones: bool = False, + ) -> int: """Adds rows of a dataframe into specified generic table in batches. Parameters @@ -1205,22 +1295,23 @@ def batch_insert(self, dataframe: pd.DataFrame, database: str, chunk_size: int = dataframe.replace({pd.np.nan: None}, inplace=True) for chunk in tqdm(chunks(dataframe.index, chunk_size), total=total, desc=desc): - - batch_cmds = ['begin'] + batch_cmds = ["begin"] for i in chunk: sql = sql_temp.format(database, dataframe.loc[i].to_json()) batch_cmds.append(sql) batch_cmds.append("commit retry 100") - cmd = ';'.join(batch_cmds) + cmd = ";".join(batch_cmds) self.client.batch(cmd) inserted += len(chunk) return inserted - def get_set_gene_rids_by_position(self, - chromosome: str, - position: int, - gene_types=['mapped', 'downstream', 'upstream']) -> Dict[str, List[str]]: + def get_set_gene_rids_by_position( + self, + chromosome: str, + position: int, + gene_types=["mapped", "downstream", "upstream"], + ) -> Dict[str, List[str]]: """Return dictionary of mapped gene by chromosal position. ALERT: creates new BEL HGNC gene is not exists. 
@@ -1228,21 +1319,27 @@ def get_set_gene_rids_by_position(self, gene_rids = defaultdict(list) sqls = dict() - sqls['mapped'] = f"""Select symbol + sqls[ + "mapped" + ] = f"""Select symbol from ensembl where start < {position} and stop > {position} and chromosome='{chromosome}' group by symbol""" - sqls['downstream'] = f"""Select symbol + sqls[ + "downstream" + ] = f"""Select symbol from ensembl where start > {position} and chromosome='{chromosome}' order by start limit 1""" - sqls['upstream'] = f"""Select symbol + sqls[ + "upstream" + ] = f"""Select symbol from ensembl where stop < {position} and @@ -1252,13 +1349,15 @@ def get_set_gene_rids_by_position(self, for gene_type, sql in sqls.items(): if gene_type in gene_types: results = self.engine.execute(sql) - for symbol, in results.fetchall(): + for (symbol,) in results.fetchall(): bel = f'g(HGNC:"{symbol}")' - data = {'name': symbol, - 'namespace': "HGNC", - 'bel': bel, - 'pure': True} - gene_rid = self.get_create_rid('gene', value_dict=data, check_for='bel') + data = { + "name": symbol, + "namespace": "HGNC", + "bel": bel, + "pure": True, + } + gene_rid = self.get_create_rid("gene", value_dict=data, check_for="bel") gene_rids[gene_type] += [gene_rid] return gene_rids @@ -1269,8 +1368,8 @@ def class_is_descendant_of(self, child_name: str, descendant_name: str) -> bool: parents_exists = self.execute(sql.format(child_name)) if parents_exists: data = parents_exists[0].oRecordData - if data and 'superClasses' in data: - parent_names = data['superClasses'] + if data and "superClasses" in data: + parent_names = data["superClasses"] if descendant_name in parent_names: return True @@ -1284,12 +1383,12 @@ def class_has_children(self, class_name) -> bool: sql = f"""Select count(name) from (select expand(classes) from metadata:schema) where '{class_name}' in superClasses""" - return self.execute(sql)[0].oRecordData['count'] > 0 + return self.execute(sql)[0].oRecordData["count"] > 0 def get_child_classes(self, class_name) -> 
List[str]: """Get list of child classes for given class_name.""" sql = f"Select name from (select expand(classes) from metadata:schema) where '{class_name}' in superClasses" - return [x.oRecordData['name'] for x in self.execute(sql)] + return [x.oRecordData["name"] for x in self.execute(sql)] def get_leaf_classes_of(self, class_name: str) -> List[str]: """Return list of children classes for the given class_name.""" @@ -1333,15 +1432,17 @@ def update_bel(self) -> None: def delete_nodes_with_no_edges(self, class_name=None) -> int: """Delete all nodes without any edges.""" if isinstance(class_name, str) and not self.class_exists(class_name): - wtext = f'You try to delete nodes with no edges from class {class_name}, ' \ - f'but node class {class_name} not exits.' + wtext = ( + f"You try to delete nodes with no edges from class {class_name}, " + f"but node class {class_name} not exits." + ) logger.warning(wtext) return 0 else: - class_name = class_name if class_name is not None else 'V' + class_name = class_name if class_name is not None else "V" return self.execute(f"Delete VERTEX {class_name} where both().size() = 0")[0] - def get_pure_symbol_rids_dict_in_bel_context(self, class_name='protein', namespace='HGNC') -> Dict[str, str]: + def get_pure_symbol_rids_dict_in_bel_context(self, class_name="protein", namespace="HGNC") -> Dict[str, str]: """Return dictionary with HGNC names as key and OrientDB @rid as value. Applies to all pure nodes in graph with class name directly or indirectly involved in BEL stmt. 
@@ -1351,35 +1452,43 @@ def get_pure_symbol_rids_dict_in_bel_context(self, class_name='protein', namespa # Following sql also includes any modification of proteins # all pure nodes with bel_relation - sql = "Select name as symbol, @rid.asString() as rid from " \ - + class_name + " where pure=true and namespace='" \ - + namespace + "' and name in (Select name from (match { class:" \ - + class_name + ", as: p, where:(namespace='" + namespace + \ - "') }.(bothE('bel_relation'){ where: (document IS NOT NULL) }) return distinct(p.name) as name))" - return {r['symbol']: r['rid'] for r in self.query_get_dict(sql)} + sql = ( + "Select name as symbol, @rid.asString() as rid from " + + class_name + + " where pure=true and namespace='" + + namespace + + "' and name in (Select name from (match { class:" + + class_name + + ", as: p, where:(namespace='" + + namespace + + "') }.(bothE('bel_relation'){ where: (document IS NOT NULL) }) return distinct(p.name) as name))" + ) + return {r["symbol"]: r["rid"] for r in self.query_get_dict(sql)} def get_pure_uniprots_in_bel_context(self) -> Set[str]: """Returns a list of all uniprot accessions in BEL annotation context.""" - sql = "match {class:protein, where:(uniprot IS NOT NULL), as:p}.(bothE('bel_relation')" \ - "{class: bel_relation,where:(document IS NOT NULL)}) return distinct(p.uniprot) as uniprot" - return {x['uniprot'] for x in self.query_get_dict(sql)} + sql = ( + "match {class:protein, where:(uniprot IS NOT NULL), as:p}.(bothE('bel_relation')" + "{class: bel_relation,where:(document IS NOT NULL)}) return distinct(p.uniprot) as uniprot" + ) + return {x["uniprot"] for x in self.query_get_dict(sql)} - def get_pure_symbol_rid_df_in_bel_context(self, class_name='protein', namespace='HGNC') -> pd.DataFrame: + def get_pure_symbol_rid_df_in_bel_context(self, class_name="protein", namespace="HGNC") -> pd.DataFrame: """Return dictionary with gene symbols as keys and node rIDs as values.""" r = 
self.get_pure_symbol_rids_dict_in_bel_context(class_name=class_name, namespace=namespace) - return pd.DataFrame(r.items(), columns=['symbol', 'rid']) + return pd.DataFrame(r.items(), columns=["symbol", "rid"]) - def get_pure_symbol_rids_dict(self, class_name='protein', namespace='HGNC') -> Dict[str, str]: + def get_pure_symbol_rids_dict(self, class_name="protein", namespace="HGNC") -> Dict[str, str]: """Return dictionary with protein name as keys and node rIDs as values.""" results = self.query_class(class_name, pure=True, namespace=namespace) - return {r['name']: r['rid'] for r in results} + return {r["name"]: r["rid"] for r in results} def get_pure_rid_by_uniprot(self, uniprot: str): """Get rIDs of node based on UniProt ID.""" sql = f"Select @rid.asString() as rid from protein where pure = true and uniprot='{uniprot}' limit 1" results = self.query_get_dict(sql) if results: - return results[0]['rid'] + return results[0]["rid"] def get_pure_uniprot_rid_dict_in_bel_context(self) -> Dict[str, str]: """Return dictionary with UniProt accession id as key and OrientDB @rid as value. 
@@ -1394,10 +1503,10 @@ def get_pure_uniprot_rid_dict_in_bel_context(self) -> Dict[str, str]: uniprot_list from bel_relation where document IS NOT NULL and (in.uniprot IS NOT NULL or out.uniprot IS NOT NULL)))""" - return {r['uniprot']: r['rid'] for r in self.query_get_dict(sql)} + return {r["uniprot"]: r["rid"] for r in self.query_get_dict(sql)} def get_pure_uniprot_rids_dict(self): """Return dictionary with UniProt IDs as keys and node rIDs as values.""" sql = "Select uniprot, @rid.asString() as rid from protein where uniprot IS NOT NULL and pure=true" results = self.query_get_dict(sql) - return {r['uniprot']: r['rid'] for r in results} + return {r["uniprot"]: r["rid"] for r in results} diff --git a/ebel/manager/orientdb/odb_structure.py b/ebel/manager/orientdb/odb_structure.py index 9439f59..a0fceb3 100755 --- a/ebel/manager/orientdb/odb_structure.py +++ b/ebel/manager/orientdb/odb_structure.py @@ -7,31 +7,33 @@ """ from copy import deepcopy from enum import Enum -from typing import List, Dict, Optional, Tuple +from typing import Dict, List, Optional, Tuple -from ebel.manager.orientdb.odb_defaults import OIndexType, ODataType -from ebel.manager.constants import normalized_pmod +from ebel.manager.orientdb.odb_defaults import (ODataType, OIndexType, + normalized_pmod) class OClassType(Enum): """Class for constants definitions.""" - NODE = 'VERTEX' - EDGE = 'EDGE' - GENERIC = 'GENERIC' + NODE = "VERTEX" + EDGE = "EDGE" + GENERIC = "GENERIC" class OProperty(object): """Generic class definition for creating properties in the OrientDB database.""" - def __init__(self, - prop_name: str, - data_type: ODataType, - linked_class: Optional[str] = None, - linked_type: Optional[ODataType] = None, - mandatory: bool = False, - node_view_label=False, - node_view_sub_label=False): + def __init__( + self, + prop_name: str, + data_type: ODataType, + linked_class: Optional[str] = None, + linked_type: Optional[ODataType] = None, + mandatory: bool = False, + node_view_label=False, + 
node_view_sub_label=False, + ): """Init method.""" self.prop_name = prop_name self.data_type = data_type @@ -45,13 +47,15 @@ def __init__(self, class OClass(object): """Generic class definition for creating classes in the OrientDB database.""" - def __init__(self, - name: str, - extends: Tuple[str, ...], - abstract: bool, - props: Tuple[OProperty, ...], - own_class: bool, - class_type: OClassType): + def __init__( + self, + name: str, + extends: Tuple[str, ...], + abstract: bool, + props: Tuple[OProperty, ...], + own_class: bool, + class_type: OClassType, + ): """Init method.""" self.class_type = class_type self.name = name @@ -86,12 +90,14 @@ def is_generic(self): class Node(OClass): """Generic class definition for creating node classes in the OrientDB database.""" - def __init__(self, - name: str, - extends: Tuple[OClass, ...] = (), - abstract: bool = False, - props: Tuple[OProperty, ...] = (), - own_class: bool = True): + def __init__( + self, + name: str, + extends: Tuple[OClass, ...] = (), + abstract: bool = False, + props: Tuple[OProperty, ...] = (), + own_class: bool = True, + ): """Init method for Node ODB class.""" extends = tuple(x.name for x in extends) OClass.__init__(self, name, extends, abstract, props, own_class, class_type=OClassType.NODE) @@ -100,13 +106,15 @@ def __init__(self, class Edge(OClass): """Generic class definition for creating edge classes in the OrientDB database.""" - def __init__(self, - name: str, - extends: Tuple[OClass, ...] = (), - abstract: bool = False, - props: Tuple[OProperty, ...] = (), - own_class: bool = True, - in_out: Tuple[Optional[OClass], Optional[OClass]] = (None, None)): + def __init__( + self, + name: str, + extends: Tuple[OClass, ...] = (), + abstract: bool = False, + props: Tuple[OProperty, ...] 
= (), + own_class: bool = True, + in_out: Tuple[Optional[OClass], Optional[OClass]] = (None, None), + ): """Init method for Edge ODB class.""" extends = tuple(x.name for x in extends) in_ = in_out[0].name if in_out[0] else None @@ -118,14 +126,24 @@ def __init__(self, class Generic(OClass): """Generic class definition for creating generic classes in the OrientDB database.""" - def __init__(self, - name: str, - extends: Tuple[str, ...] = (), - abstract: bool = False, - props: Tuple[OProperty, ...] = (), - own_class: bool = True): + def __init__( + self, + name: str, + extends: Tuple[str, ...] = (), + abstract: bool = False, + props: Tuple[OProperty, ...] = (), + own_class: bool = True, + ): """Init method for Generic ODB class.""" - OClass.__init__(self, name, extends, abstract, props, own_class, class_type=OClassType.GENERIC) + OClass.__init__( + self, + name, + extends, + abstract, + props, + own_class, + class_type=OClassType.GENERIC, + ) class OIndex(object): @@ -138,106 +156,153 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd self.index_type = index_type -basic_node = Node(name='V') -basic_edge = Edge(name='E') +basic_node = Node(name="V") +basic_edge = Edge(name="E") ############################################################################## # Definition of BEL vertices, edges and indices ############################################################################## -bel_document = Generic('bel_document', props=( - OProperty('name', ODataType.STRING), - OProperty('date_uploaded', ODataType.DATETIME), - OProperty('description', ODataType.STRING), - OProperty('version', ODataType.STRING), - OProperty('authors', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('contact_info', ODataType.STRING), - OProperty('copyright', ODataType.STRING), - OProperty('licences', ODataType.STRING), - OProperty('git', ODataType.EMBEDDEDMAP), - OProperty('namespaces', ODataType.EMBEDDEDMAP), - OProperty('annotations', 
ODataType.EMBEDDEDMAP), - OProperty('file', ODataType.EMBEDDEDMAP), - OProperty('git_info', ODataType.EMBEDDEDMAP), - OProperty('keywords', ODataType.LINKSET, 'keyword'), -)) +bel_document = Generic( + "bel_document", + props=( + OProperty("name", ODataType.STRING), + OProperty("date_uploaded", ODataType.DATETIME), + OProperty("description", ODataType.STRING), + OProperty("version", ODataType.STRING), + OProperty("authors", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("contact_info", ODataType.STRING), + OProperty("copyright", ODataType.STRING), + OProperty("licences", ODataType.STRING), + OProperty("git", ODataType.EMBEDDEDMAP), + OProperty("namespaces", ODataType.EMBEDDEDMAP), + OProperty("annotations", ODataType.EMBEDDEDMAP), + OProperty("file", ODataType.EMBEDDEDMAP), + OProperty("git_info", ODataType.EMBEDDEDMAP), + OProperty("keywords", ODataType.LINKSET, "keyword"), + ), +) bel_generics: Tuple[Generic, ...] = ( - Generic('keyword', props=( - OProperty('label', ODataType.STRING), - OProperty('description', ODataType.STRING), - )), + Generic( + "keyword", + props=( + OProperty("label", ODataType.STRING), + OProperty("description", ODataType.STRING), + ), + ), bel_document, ) -namespace_name = Node('nn', (basic_node,), abstract=True, props=( - OProperty('namespace', ODataType.STRING), - OProperty('name', ODataType.STRING), -)) -bel = Node('bel', (basic_node,), abstract=True, props=( - OProperty('bel', ODataType.STRING, mandatory=True, node_view_sub_label=True), - OProperty('label', ODataType.STRING), - OProperty('involved_genes', ODataType.EMBEDDEDSET, linked_type=ODataType.STRING, node_view_label=True), - OProperty('involved_other', ODataType.EMBEDDEDSET, linked_type=ODataType.STRING, node_view_label=True), - OProperty('suggested_corrections', ODataType.EMBEDDEDMAP, linked_type=ODataType.STRING), -)) -bio_object = Node('bio_object', (bel,), abstract=True, props=( - OProperty('chebi', ODataType.INTEGER), -)) -pure_object = 
Node('pure_object', (basic_node,), abstract=True, props=( - OProperty('pure', ODataType.BOOLEAN), - OProperty('species', ODataType.INTEGER), -)) -genetic_flow = Node('genetic_flow', (bio_object, namespace_name, pure_object), abstract=True) -bio_concept = Node('bio_concept', (bel,), abstract=True) -location_object = Node('location_object', (basic_node,), abstract=True, props=( - OProperty('location', ODataType.EMBEDDEDMAP), -)) -bio_act = Node('bio_act', (bel,)) -bio_list = Node('bio_list', (bel,), abstract=True) -ebel = Node('ebel', (basic_node,), abstract=True, props=( - OProperty('bel', ODataType.STRING, node_view_sub_label=True), - OProperty('name', ODataType.STRING, node_view_label=True), - OProperty('namespace', ODataType.STRING), - OProperty('label', ODataType.STRING), -)) -protein = Node('protein', (genetic_flow, location_object), props=( - OProperty('uniprot', ODataType.STRING), -)) -gene = Node('gene', (genetic_flow, location_object)) -rna = Node('rna', (genetic_flow, location_object)) -abundance = Node('abundance', (bio_object, namespace_name, pure_object)) -population = Node('population', (bio_object, namespace_name, pure_object)) -from_location = Node('from_location', (ebel,)) -to_location = Node('to_location', (ebel,)) -biological_process = Node('biological_process', (bio_concept, namespace_name)) -pathology = Node('pathology', (bio_concept, namespace_name)) -complex_ = Node('complex', (bio_object, namespace_name, pure_object)) -micro_rna = Node('micro_rna', (bio_object, namespace_name, location_object)) -activity = Node('activity', (bio_act, namespace_name), props=( - OProperty('default', ODataType.STRING), -)) -reaction = Node('reaction', (bio_act,)) -degradation = Node('degradation', (bio_act,)) -cell_secretion = Node('cell_secretion', (bio_act,)) -translocation = Node('translocation', (bio_act,)) -cell_surface_expression = Node('cell_surface_expression', (bio_act,)) -list_ = Node('list', (bio_list,)) -composite = Node('composite', (bio_list,)) 
-variant = Node('variant', (ebel,), props=( - OProperty('hgvs', ODataType.STRING, node_view_label=True), -)) -fragment = Node('fragment', (ebel,)) -location = Node('location', (ebel,)) -pmod = Node('pmod', (ebel,), props=( - OProperty('position', ODataType.INTEGER, node_view_label=True), - OProperty('type', ODataType.STRING, node_view_label=True), - OProperty('amino_acid', ODataType.STRING, node_view_label=True), -)) -gmod = Node('gmod', (ebel,)) -reactants = Node('reactants', (ebel,)) -products = Node('products', (ebel,)) -fusion_protein = Node('fusion_protein', (bel,)) -fusion_rna = Node('fusion_rna', (bel,)) -fusion_gene = Node('fusion_gene', (bel,)) +namespace_name = Node( + "nn", + (basic_node,), + abstract=True, + props=( + OProperty("namespace", ODataType.STRING), + OProperty("name", ODataType.STRING), + ), +) +bel = Node( + "bel", + (basic_node,), + abstract=True, + props=( + OProperty("bel", ODataType.STRING, mandatory=True, node_view_sub_label=True), + OProperty("label", ODataType.STRING), + OProperty( + "involved_genes", + ODataType.EMBEDDEDSET, + linked_type=ODataType.STRING, + node_view_label=True, + ), + OProperty( + "involved_other", + ODataType.EMBEDDEDSET, + linked_type=ODataType.STRING, + node_view_label=True, + ), + OProperty("suggested_corrections", ODataType.EMBEDDEDMAP, linked_type=ODataType.STRING), + ), +) +bio_object = Node("bio_object", (bel,), abstract=True, props=(OProperty("chebi", ODataType.INTEGER),)) +pure_object = Node( + "pure_object", + (basic_node,), + abstract=True, + props=( + OProperty("pure", ODataType.BOOLEAN), + OProperty("species", ODataType.INTEGER), + ), +) +genetic_flow = Node("genetic_flow", (bio_object, namespace_name, pure_object), abstract=True) +bio_concept = Node("bio_concept", (bel,), abstract=True) +location_object = Node( + "location_object", + (basic_node,), + abstract=True, + props=(OProperty("location", ODataType.EMBEDDEDMAP),), +) +bio_act = Node("bio_act", (bel,)) +bio_list = Node("bio_list", (bel,), 
abstract=True) +ebel = Node( + "ebel", + (basic_node,), + abstract=True, + props=( + OProperty("bel", ODataType.STRING, node_view_sub_label=True), + OProperty("name", ODataType.STRING, node_view_label=True), + OProperty("namespace", ODataType.STRING), + OProperty("label", ODataType.STRING), + ), +) +protein = Node( + "protein", + (genetic_flow, location_object), + props=(OProperty("uniprot", ODataType.STRING),), +) +gene = Node("gene", (genetic_flow, location_object)) +rna = Node("rna", (genetic_flow, location_object)) +abundance = Node("abundance", (bio_object, namespace_name, pure_object)) +population = Node("population", (bio_object, namespace_name, pure_object)) +from_location = Node("from_location", (ebel,)) +to_location = Node("to_location", (ebel,)) +biological_process = Node("biological_process", (bio_concept, namespace_name)) +pathology = Node("pathology", (bio_concept, namespace_name)) +complex_ = Node("complex", (bio_object, namespace_name, pure_object)) +micro_rna = Node("micro_rna", (bio_object, namespace_name, location_object)) +activity = Node( + "activity", + (bio_act, namespace_name), + props=(OProperty("default", ODataType.STRING),), +) +reaction = Node("reaction", (bio_act,)) +degradation = Node("degradation", (bio_act,)) +cell_secretion = Node("cell_secretion", (bio_act,)) +translocation = Node("translocation", (bio_act,)) +cell_surface_expression = Node("cell_surface_expression", (bio_act,)) +list_ = Node("list", (bio_list,)) +composite = Node("composite", (bio_list,)) +variant = Node( + "variant", + (ebel,), + props=(OProperty("hgvs", ODataType.STRING, node_view_label=True),), +) +fragment = Node("fragment", (ebel,)) +location = Node("location", (ebel,)) +pmod = Node( + "pmod", + (ebel,), + props=( + OProperty("position", ODataType.INTEGER, node_view_label=True), + OProperty("type", ODataType.STRING, node_view_label=True), + OProperty("amino_acid", ODataType.STRING, node_view_label=True), + ), +) +gmod = Node("gmod", (ebel,)) +reactants = 
Node("reactants", (ebel,)) +products = Node("products", (ebel,)) +fusion_protein = Node("fusion_protein", (bel,)) +fusion_rna = Node("fusion_rna", (bel,)) +fusion_gene = Node("fusion_gene", (bel,)) bel_nodes: Tuple[Node, ...] = ( bel, @@ -281,153 +346,159 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd fusion_gene, ) -bel_relation = Edge('bel_relation', (basic_edge,), abstract=True, props=( - OProperty("evidence", ODataType.STRING), - OProperty("pmid", ODataType.INTEGER), - OProperty("pmc", ODataType.STRING), - OProperty("citation", ODataType.EMBEDDEDMAP), - OProperty("annotation", ODataType.EMBEDDEDMAP), - OProperty("document", ODataType.LINKSET, linked_class=bel_document.name))) -ebel_relation = Edge('ebel_relation', (basic_edge,), abstract=True) -causal = Edge('causal', (bel_relation,), abstract=True, in_out=(bel, bel)) -correlative = Edge('correlative', (bel_relation,), abstract=True, in_out=(bel, bel)) -genomic = Edge('genomic', (bel_relation,), abstract=True) -other = Edge('other', (bel_relation,), abstract=True) -deprecated = Edge('deprecated', (bel_relation,), abstract=True) -compiler = Edge('compiler', (bel_relation,), abstract=True) -has_modified = Edge('has_modified', (ebel_relation,), abstract=True) -has_variant_obj = Edge('has_variant_obj', (ebel_relation,), abstract=True) -has_located = Edge('has_located', (ebel_relation,), abstract=True) -has_ppi = Edge('has_ppi', (ebel_relation,), abstract=True, own_class=False, in_out=(protein, protein)) +bel_relation = Edge( + "bel_relation", + (basic_edge,), + abstract=True, + props=( + OProperty("evidence", ODataType.STRING), + OProperty("pmid", ODataType.INTEGER), + OProperty("pmc", ODataType.STRING), + OProperty("citation", ODataType.EMBEDDEDMAP), + OProperty("annotation", ODataType.EMBEDDEDMAP), + OProperty("document", ODataType.LINKSET, linked_class=bel_document.name), + ), +) +ebel_relation = Edge("ebel_relation", (basic_edge,), abstract=True) +causal = Edge("causal", 
(bel_relation,), abstract=True, in_out=(bel, bel)) +correlative = Edge("correlative", (bel_relation,), abstract=True, in_out=(bel, bel)) +genomic = Edge("genomic", (bel_relation,), abstract=True) +other = Edge("other", (bel_relation,), abstract=True) +deprecated = Edge("deprecated", (bel_relation,), abstract=True) +compiler = Edge("compiler", (bel_relation,), abstract=True) +has_modified = Edge("has_modified", (ebel_relation,), abstract=True) +has_variant_obj = Edge("has_variant_obj", (ebel_relation,), abstract=True) +has_located = Edge("has_located", (ebel_relation,), abstract=True) +has_ppi = Edge( + "has_ppi", + (ebel_relation,), + abstract=True, + own_class=False, + in_out=(protein, protein), +) bel_edges: Tuple[Edge, ...] = ( bel_relation, causal, - Edge('increases', (causal,)), - Edge('directly_increases', (causal,)), - Edge('decreases', (causal,)), - Edge('directly_decreases', (causal,)), - Edge('rate_limiting_step_of', (causal,)), - Edge('causes_no_change', (causal,)), - Edge('regulates', (causal,)), - + Edge("increases", (causal,)), + Edge("directly_increases", (causal,)), + Edge("decreases", (causal,)), + Edge("directly_decreases", (causal,)), + Edge("rate_limiting_step_of", (causal,)), + Edge("causes_no_change", (causal,)), + Edge("regulates", (causal,)), correlative, - Edge('negative_correlation', (correlative,)), - Edge('positive_correlation', (correlative,)), - Edge('association', (correlative,)), - Edge('no_correlation', (correlative,)), - + Edge("negative_correlation", (correlative,)), + Edge("positive_correlation", (correlative,)), + Edge("association", (correlative,)), + Edge("no_correlation", (correlative,)), genomic, - Edge('orthologous', (genomic,), in_out=(bel, bel)), - Edge('transcribed_to', (genomic,), in_out=(rna, gene)), - Edge('translated_to', (genomic,), in_out=(protein, rna)), - + Edge("orthologous", (genomic,), in_out=(bel, bel)), + Edge("transcribed_to", (genomic,), in_out=(rna, gene)), + Edge("translated_to", (genomic,), 
in_out=(protein, rna)), other, - Edge('has_member', (other,), in_out=(bel, bel)), - Edge('has_members', (other,), in_out=(bel, bel)), - Edge('has_component', (other,), in_out=(bel, bel)), - Edge('has_components', (other,), in_out=(bel, bel)), - Edge('equivalent_to', (other,), in_out=(bel, bel)), - Edge('is_a', (other,), in_out=(bel, bel)), - Edge('sub_process_of', (other,), in_out=(bel, bel)), - + Edge("has_member", (other,), in_out=(bel, bel)), + Edge("has_members", (other,), in_out=(bel, bel)), + Edge("has_component", (other,), in_out=(bel, bel)), + Edge("has_components", (other,), in_out=(bel, bel)), + Edge("equivalent_to", (other,), in_out=(bel, bel)), + Edge("is_a", (other,), in_out=(bel, bel)), + Edge("sub_process_of", (other,), in_out=(bel, bel)), deprecated, - Edge('analogous_to', (deprecated,), in_out=(bel, bel)), - Edge('biomarker_for', (deprecated,), in_out=(bel, bel)), - Edge('prognostic_biomarker_for', (deprecated,)), - + Edge("analogous_to", (deprecated,), in_out=(bel, bel)), + Edge("biomarker_for", (deprecated,), in_out=(bel, bel)), + Edge("prognostic_biomarker_for", (deprecated,)), compiler, - Edge('acts_in', (compiler,), in_out=(bel, bel)), - Edge('has_product', (compiler,), in_out=(bel, bel)), - Edge('has_variant', (compiler,), in_out=(bel, bel)), - Edge('has_modification', (compiler,), in_out=(bel, bel)), - Edge('reactant_in', (compiler,), in_out=(bel, bel)), - Edge('translocates', (compiler,), in_out=(bel, bel)), - Edge('includes', (compiler,), in_out=(bel, bel)), - + Edge("acts_in", (compiler,), in_out=(bel, bel)), + Edge("has_product", (compiler,), in_out=(bel, bel)), + Edge("has_variant", (compiler,), in_out=(bel, bel)), + Edge("has_modification", (compiler,), in_out=(bel, bel)), + Edge("reactant_in", (compiler,), in_out=(bel, bel)), + Edge("translocates", (compiler,), in_out=(bel, bel)), + Edge("includes", (compiler,), in_out=(bel, bel)), ebel_relation, # TODO: check if always basic_node is needed or if we can refine that - 
Edge('has__protein', (ebel_relation,), in_out=(protein, basic_node)), - Edge('has__rna', (ebel_relation,), in_out=(rna, basic_node)), - Edge('has__gene', (ebel_relation,), in_out=(gene, basic_node)), - Edge('has__abundance', (ebel_relation,), in_out=(abundance, basic_node)), - Edge('has__population', (ebel_relation,), in_out=(population, basic_node)), - Edge('has__location', (ebel_relation,), in_out=(ebel, basic_node)), - Edge('has__from_location', (ebel_relation,), in_out=(from_location, basic_node)), - Edge('has__to_location', (ebel_relation,), in_out=(to_location, basic_node)), - Edge('has__fragment', (ebel_relation,), in_out=(fragment, basic_node)), - Edge('has__pmod', (ebel_relation,), in_out=(pmod, basic_node)), - Edge('has__gmod', (ebel_relation,), in_out=(gmod, basic_node)), - Edge('has__complex', (ebel_relation,), in_out=(complex_, basic_node)), - Edge('has__micro_rna', (ebel_relation,), in_out=(micro_rna, basic_node)), - Edge('has__variant', (ebel_relation,), in_out=(variant, basic_node)), - Edge('has__reactants', (ebel_relation,), in_out=(reactants, basic_node)), - Edge('has__products', (ebel_relation,), in_out=(products, basic_node)), - Edge('has__composite', (ebel_relation,), in_out=(composite, basic_node)), - - Edge('has_fragmented_protein', (ebel_relation,), in_out=(bel, bel)), - + Edge("has__protein", (ebel_relation,), in_out=(protein, basic_node)), + Edge("has__rna", (ebel_relation,), in_out=(rna, basic_node)), + Edge("has__gene", (ebel_relation,), in_out=(gene, basic_node)), + Edge("has__abundance", (ebel_relation,), in_out=(abundance, basic_node)), + Edge("has__population", (ebel_relation,), in_out=(population, basic_node)), + Edge("has__location", (ebel_relation,), in_out=(ebel, basic_node)), + Edge("has__from_location", (ebel_relation,), in_out=(from_location, basic_node)), + Edge("has__to_location", (ebel_relation,), in_out=(to_location, basic_node)), + Edge("has__fragment", (ebel_relation,), in_out=(fragment, basic_node)), + Edge("has__pmod", 
(ebel_relation,), in_out=(pmod, basic_node)), + Edge("has__gmod", (ebel_relation,), in_out=(gmod, basic_node)), + Edge("has__complex", (ebel_relation,), in_out=(complex_, basic_node)), + Edge("has__micro_rna", (ebel_relation,), in_out=(micro_rna, basic_node)), + Edge("has__variant", (ebel_relation,), in_out=(variant, basic_node)), + Edge("has__reactants", (ebel_relation,), in_out=(reactants, basic_node)), + Edge("has__products", (ebel_relation,), in_out=(products, basic_node)), + Edge("has__composite", (ebel_relation,), in_out=(composite, basic_node)), + Edge("has_fragmented_protein", (ebel_relation,), in_out=(bel, bel)), has_modified, - Edge('has_modified_protein', (has_modified,), in_out=(protein, protein)), - Edge('has_modified_gene', (has_modified,), in_out=(gene, gene)), - + Edge("has_modified_protein", (has_modified,), in_out=(protein, protein)), + Edge("has_modified_gene", (has_modified,), in_out=(gene, gene)), has_variant_obj, - Edge('has_variant_gene', (has_variant_obj,), in_out=(gene, gene)), - Edge('has_variant_rna', (has_variant_obj,), in_out=(rna, rna)), - Edge('has_variant_protein', (has_variant_obj,), in_out=(protein, protein)), - Edge('has_variant_micro_rna', (has_variant_obj,), in_out=(micro_rna, micro_rna)), - + Edge("has_variant_gene", (has_variant_obj,), in_out=(gene, gene)), + Edge("has_variant_rna", (has_variant_obj,), in_out=(rna, rna)), + Edge("has_variant_protein", (has_variant_obj,), in_out=(protein, protein)), + Edge("has_variant_micro_rna", (has_variant_obj,), in_out=(micro_rna, micro_rna)), has_located, - Edge('has_located_gene', (has_located,), in_out=(gene, gene)), - Edge('has_located_rna', (has_located,), in_out=(rna, rna)), - Edge('has_located_protein', (has_located,), in_out=(protein, protein), props=( - OProperty('levels', ODataType.EMBEDDEDMAP), - )), - Edge('has_located_micro_rna', (has_located,), in_out=(micro_rna, micro_rna)), - Edge('has_located_complex', (has_located,), in_out=(complex_, complex_)), - 
Edge('has_located_abundance', (has_located,), in_out=(abundance, abundance)), - Edge('has_located_population', (has_located,), in_out=(population, population)), - Edge('pathway_interaction', (ebel_relation,), abstract=True), - + Edge("has_located_gene", (has_located,), in_out=(gene, gene)), + Edge("has_located_rna", (has_located,), in_out=(rna, rna)), + Edge( + "has_located_protein", + (has_located,), + in_out=(protein, protein), + props=(OProperty("levels", ODataType.EMBEDDEDMAP),), + ), + Edge("has_located_micro_rna", (has_located,), in_out=(micro_rna, micro_rna)), + Edge("has_located_complex", (has_located,), in_out=(complex_, complex_)), + Edge("has_located_abundance", (has_located,), in_out=(abundance, abundance)), + Edge("has_located_population", (has_located,), in_out=(population, population)), + Edge("pathway_interaction", (ebel_relation,), abstract=True), has_ppi, ) bel_indices = ( - OIndex(bel, ('bel',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(bel, ('involved_genes',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(bel, ('involved_other',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(bel_relation, ('evidence',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(protein, ('uniprot',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(bel_relation, ('annotation',), OIndexType.DICTIONARY), - OIndex(bel_relation, ('citation',), OIndexType.DICTIONARY), - OIndex(bel_relation, ('pmid',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(namespace_name, ('namespace', 'name'), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(namespace_name, ('name',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(namespace_name, ('namespace',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(genetic_flow, ('pure',), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(bel, ("bel",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(bel, ("involved_genes",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(bel, ("involved_other",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(bel_relation, ("evidence",), OIndexType.NOTUNIQUE_HASH_INDEX), + 
OIndex(protein, ("uniprot",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(bel_relation, ("annotation",), OIndexType.DICTIONARY), + OIndex(bel_relation, ("citation",), OIndexType.DICTIONARY), + OIndex(bel_relation, ("pmid",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(namespace_name, ("namespace", "name"), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(namespace_name, ("name",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(namespace_name, ("namespace",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(genetic_flow, ("pure",), OIndexType.NOTUNIQUE_HASH_INDEX), ) ############################################################################## # Definition of IntAct vertices, edges and indices ############################################################################## -has_ppi_ia = Edge('has_ppi_ia', (has_ppi,), props=( - OProperty("interaction_type", ODataType.STRING), - OProperty("interaction_type_psimi_id", ODataType.STRING), - OProperty("confidence_value", ODataType.FLOAT), - OProperty("detection_method", ODataType.STRING), - OProperty("detection_method_psimi_id", ODataType.INTEGER), - OProperty("pmid", ODataType.INTEGER), - OProperty("interaction_ids", ODataType.EMBEDDEDMAP), -)) +has_ppi_ia = Edge( + "has_ppi_ia", + (has_ppi,), + props=( + OProperty("interaction_type", ODataType.STRING), + OProperty("interaction_type_psimi_id", ODataType.STRING), + OProperty("confidence_value", ODataType.FLOAT), + OProperty("detection_method", ODataType.STRING), + OProperty("detection_method_psimi_id", ODataType.INTEGER), + OProperty("pmid", ODataType.INTEGER), + OProperty("interaction_ids", ODataType.EMBEDDEDMAP), + ), +) intact_edges: Tuple[Edge, ...] 
= ( has_ppi, has_ppi_ia, ) -intact_indices = ( - OIndex(has_ppi_ia, ('detection_method', 'interaction_type'), OIndexType.NOTUNIQUE), -) +intact_indices = (OIndex(has_ppi_ia, ("detection_method", "interaction_type"), OIndexType.NOTUNIQUE),) ############################################################################## # Definition of HGNC generics and indices @@ -435,68 +506,69 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd # See .urls.HGNC_JSON # https://www.genenames.org/help/download ############################################################################## -hgnc = Generic('hgnc', props=( - OProperty('id', ODataType.STRING), - OProperty('version', ODataType.INTEGER), - OProperty('alias_name', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('alias_symbol', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('bioparadigms_slc', ODataType.STRING), - OProperty('ccds_id', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('cd', ODataType.STRING), - OProperty('cosmic', ODataType.STRING), - OProperty('date_approved_reserved', ODataType.DATE), - OProperty('date_modified', ODataType.DATE), - OProperty('date_name_changed', ODataType.DATE), - OProperty('date_symbol_changed', ODataType.DATE), - OProperty('ena', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('ensembl_gene_id', ODataType.STRING), - OProperty('entrez_id', ODataType.INTEGER), - OProperty('enzyme_id', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('gene_family', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('gene_family_id', ODataType.EMBEDDEDLIST), - OProperty('homeodb', ODataType.STRING), - OProperty('horde_id', ODataType.STRING), - OProperty('imgt', ODataType.STRING), - OProperty('intermediate_filament_db', ODataType.STRING), - OProperty('iuphar', ODataType.STRING), - OProperty('kznf_gene_catalog', ODataType.INTEGER), - OProperty('lncipedia', ODataType.STRING), 
- OProperty('lncrnadb', ODataType.STRING), - OProperty('location', ODataType.STRING), - OProperty('location_sortable', ODataType.STRING), - OProperty('locus_group', ODataType.STRING), - OProperty('locus_type', ODataType.STRING), - OProperty('lsdb', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('mamit_trnadb', ODataType.INTEGER), - OProperty('merops', ODataType.STRING), - OProperty('mgd_id', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('mirbase', ODataType.STRING), - OProperty('name', ODataType.STRING), - OProperty('omim_id', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('orphanet', ODataType.INTEGER), - OProperty('prev_name', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('prev_symbol', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('pseudogene_org', ODataType.STRING), - OProperty('pubmed_id', ODataType.EMBEDDEDLIST, linked_type=ODataType.INTEGER), - OProperty('refseq_accession', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('rgd_id', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('rna_central_ids', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('snornabase', ODataType.STRING), - OProperty('status', ODataType.STRING), - OProperty('symbol', ODataType.STRING), - OProperty('ucsc_id', ODataType.STRING), - OProperty('uniprot_ids', ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), - OProperty('uuid', ODataType.STRING), - OProperty('vega_id', ODataType.STRING))) +hgnc = Generic( + "hgnc", + props=( + OProperty("id", ODataType.STRING), + OProperty("version", ODataType.INTEGER), + OProperty("alias_name", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("alias_symbol", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("bioparadigms_slc", ODataType.STRING), + OProperty("ccds_id", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("cd", ODataType.STRING), + 
OProperty("cosmic", ODataType.STRING), + OProperty("date_approved_reserved", ODataType.DATE), + OProperty("date_modified", ODataType.DATE), + OProperty("date_name_changed", ODataType.DATE), + OProperty("date_symbol_changed", ODataType.DATE), + OProperty("ena", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("ensembl_gene_id", ODataType.STRING), + OProperty("entrez_id", ODataType.INTEGER), + OProperty("enzyme_id", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("gene_family", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("gene_family_id", ODataType.EMBEDDEDLIST), + OProperty("homeodb", ODataType.STRING), + OProperty("horde_id", ODataType.STRING), + OProperty("imgt", ODataType.STRING), + OProperty("iuphar", ODataType.STRING), + OProperty("kznf_gene_catalog", ODataType.INTEGER), + OProperty("lncipedia", ODataType.STRING), + OProperty("lncrnadb", ODataType.STRING), + OProperty("location", ODataType.STRING), + OProperty("location_sortable", ODataType.STRING), + OProperty("locus_group", ODataType.STRING), + OProperty("locus_type", ODataType.STRING), + OProperty("lsdb", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("merops", ODataType.STRING), + OProperty("mgd_id", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("mirbase", ODataType.STRING), + OProperty("name", ODataType.STRING), + OProperty("omim_id", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("orphanet", ODataType.INTEGER), + OProperty("prev_name", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("prev_symbol", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("pubmed_id", ODataType.EMBEDDEDLIST, linked_type=ODataType.INTEGER), + OProperty("refseq_accession", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("rgd_id", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("rna_central_ids", ODataType.EMBEDDEDLIST, 
linked_type=ODataType.STRING), + OProperty("snornabase", ODataType.STRING), + OProperty("status", ODataType.STRING), + OProperty("symbol", ODataType.STRING), + OProperty("ucsc_id", ODataType.STRING), + OProperty("uniprot_ids", ODataType.EMBEDDEDLIST, linked_type=ODataType.STRING), + OProperty("uuid", ODataType.STRING), + OProperty("vega_id", ODataType.STRING), + ), +) hgnc_generics: Tuple[Generic, ...] = (hgnc,) -hgnc_nodes: Tuple[Node, ...] = (deepcopy(genetic_flow).add_properties([OProperty('hgnc', ODataType.LINK, 'hgnc')]),) +hgnc_nodes: Tuple[Node, ...] = (deepcopy(genetic_flow).add_properties([OProperty("hgnc", ODataType.LINK, "hgnc")]),) hgnc_indices = ( - OIndex(hgnc, ('symbol',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(hgnc, ('id',), OIndexType.UNIQUE_HASH_INDEX), - OIndex(hgnc, ('ensembl_gene_id',), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(hgnc, ("symbol",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(hgnc, ("id",), OIndexType.UNIQUE_HASH_INDEX), + OIndex(hgnc, ("ensembl_gene_id",), OIndexType.NOTUNIQUE_HASH_INDEX), ) ############################################################################## @@ -504,29 +576,43 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd ############################################################################## mirtarbase_edges: Tuple[Edge, ...] 
= ( - Edge('has_mirgene_target', (ebel_relation,), props=( - OProperty('support_type', ODataType.STRING), - OProperty('pmid', ODataType.INTEGER), - OProperty('experiments', ODataType.EMBEDDEDSET, linked_type=ODataType.STRING), - ), in_out=(rna, micro_rna)), + Edge( + "has_mirgene_target", + (ebel_relation,), + props=( + OProperty("support_type", ODataType.STRING), + OProperty("pmid", ODataType.INTEGER), + OProperty("experiments", ODataType.EMBEDDEDSET, linked_type=ODataType.STRING), + ), + in_out=(rna, micro_rna), + ), ) ############################################################################## # Definition of GwasCatalog generics and indices ############################################################################## -snp = Node('snp', (ebel,), False, ( - OProperty('rs_number', ODataType.STRING, node_view_label=True), -), own_class=False) +snp = Node( + "snp", + (ebel,), + False, + (OProperty("rs_number", ODataType.STRING, node_view_label=True),), + own_class=False, +) gwascatalog_nodes: Tuple[Node, ...] 
= (snp,) -has_snp = Edge('has_snp', (ebel_relation,), abstract=True, own_class=False, in_out=(snp, gene)) -has_mapped_snp = Edge('has_mapped_snp', (has_snp,), abstract=True, own_class=False) -has_downstream_snp = Edge('has_downstream_snp', (has_snp,), abstract=True, own_class=False) -has_upstream_snp = Edge('has_upstream_snp', (has_snp,), abstract=True, own_class=False) -has_snp_gwascatalog = Edge('has_snp_gwascatalog', (has_snp,), abstract=True, props=( - OProperty('disease_trait', ODataType.STRING), - OProperty('pubmed_id', ODataType.INTEGER) -)) +has_snp = Edge("has_snp", (ebel_relation,), abstract=True, own_class=False, in_out=(snp, gene)) +has_mapped_snp = Edge("has_mapped_snp", (has_snp,), abstract=True, own_class=False) +has_downstream_snp = Edge("has_downstream_snp", (has_snp,), abstract=True, own_class=False) +has_upstream_snp = Edge("has_upstream_snp", (has_snp,), abstract=True, own_class=False) +has_snp_gwascatalog = Edge( + "has_snp_gwascatalog", + (has_snp,), + abstract=True, + props=( + OProperty("disease_trait", ODataType.STRING), + OProperty("pubmed_id", ODataType.INTEGER), + ), +) gwascatalog_edges: Tuple[Edge, ...] 
= ( has_snp, @@ -534,37 +620,46 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd has_mapped_snp, has_downstream_snp, has_upstream_snp, - Edge('has_mapped_snp_gc', (has_mapped_snp, has_snp_gwascatalog)), - Edge('has_downstream_snp_gc', (has_downstream_snp, has_snp_gwascatalog)), - Edge('has_upstream_snp_gc', (has_upstream_snp, has_snp_gwascatalog)), + Edge("has_mapped_snp_gc", (has_mapped_snp, has_snp_gwascatalog)), + Edge("has_downstream_snp_gc", (has_downstream_snp, has_snp_gwascatalog)), + Edge("has_upstream_snp_gc", (has_upstream_snp, has_snp_gwascatalog)), ) ############################################################################## # Definition of Ortholog generics and indices ############################################################################## -drug = Node('drug', (ebel,), abstract=True, own_class=False) -drug_db = Node('drug_db', (drug,), props=( - OProperty('label', ODataType.STRING, node_view_label=True), - OProperty('drugbank_id', ODataType.STRING), - OProperty('description', ODataType.STRING), - OProperty('cas_number', ODataType.STRING, node_view_sub_label=True), - OProperty('indication', ODataType.STRING), - OProperty('pharmacodynamics', ODataType.STRING), - OProperty('toxicity', ODataType.STRING), - OProperty('metabolism', ODataType.STRING), - OProperty('mechanism_of_action', ODataType.STRING), -)) +drug = Node("drug", (ebel,), abstract=True, own_class=False) +drug_db = Node( + "drug_db", + (drug,), + props=( + OProperty("label", ODataType.STRING, node_view_label=True), + OProperty("drugbank_id", ODataType.STRING), + OProperty("description", ODataType.STRING), + OProperty("cas_number", ODataType.STRING, node_view_sub_label=True), + OProperty("indication", ODataType.STRING), + OProperty("pharmacodynamics", ODataType.STRING), + OProperty("toxicity", ODataType.STRING), + OProperty("metabolism", ODataType.STRING), + OProperty("mechanism_of_action", ODataType.STRING), + ), +) drugbank_nodes: Tuple[Node, ...] 
= ( drug, drug_db, ) -has_drug_target = Edge('has_drug_target', (ebel_relation,), abstract=True, own_class=False) -has_drug_target_db = Edge('has_drug_target_db', (has_drug_target,), in_out=(protein, drug_db), props=( - OProperty("action", ODataType.STRING), - OProperty("known_action", ODataType.STRING), -)) +has_drug_target = Edge("has_drug_target", (ebel_relation,), abstract=True, own_class=False) +has_drug_target_db = Edge( + "has_drug_target_db", + (has_drug_target,), + in_out=(protein, drug_db), + props=( + OProperty("action", ODataType.STRING), + OProperty("known_action", ODataType.STRING), + ), +) drugbank_edges: Tuple[Edge, ...] = ( has_drug_target, @@ -572,35 +667,40 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd ) drugbank_indices = ( - OIndex(has_drug_target_db, ('action',), OIndexType.NOTUNIQUE_HASH_INDEX), - OIndex(drug_db, ('drugbank_id',), OIndexType.UNIQUE_HASH_INDEX), + OIndex(has_drug_target_db, ("action",), OIndexType.NOTUNIQUE_HASH_INDEX), + OIndex(drug_db, ("drugbank_id",), OIndexType.UNIQUE_HASH_INDEX), ) ############################################################################## # Definition of IUPHAR's Guide to Pharmacology drugs ############################################################################## -iuphar_interaction = Edge('iuphar_interaction', (ebel_relation,), props=( - OProperty("pmids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), - OProperty("assay_description", ODataType.STRING), - OProperty("affinity_units", ODataType.STRING), - OProperty("affinity_low", ODataType.FLOAT), - OProperty("affinity_median", ODataType.FLOAT), - OProperty("affinity_high", ODataType.FLOAT), - OProperty("type", ODataType.STRING), - OProperty("action", ODataType.STRING), -), in_out=(bel, bel)) +iuphar_interaction = Edge( + "iuphar_interaction", + (ebel_relation,), + props=( + OProperty("pmids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), + OProperty("assay_description", ODataType.STRING), + 
OProperty("affinity_units", ODataType.STRING), + OProperty("affinity_low", ODataType.FLOAT), + OProperty("affinity_median", ODataType.FLOAT), + OProperty("affinity_high", ODataType.FLOAT), + OProperty("type", ODataType.STRING), + OProperty("action", ODataType.STRING), + ), + in_out=(bel, bel), +) iuphar_edges: Tuple[Edge, ...] = ( iuphar_interaction, - Edge('agonist_of__iu', (iuphar_interaction,)), - Edge('inhibits__iu', (iuphar_interaction,)), - Edge('antagonist_of__iu', (iuphar_interaction,)), - Edge('channel_blocker_of__iu', (iuphar_interaction,)), - Edge('allosteric_modulator_of__iu', (iuphar_interaction,)), - Edge('activates__iu', (iuphar_interaction,)), - Edge('antibody_against__iu', (iuphar_interaction,)), - Edge('inhibits_gating__iu', (iuphar_interaction,)), + Edge("agonist_of__iu", (iuphar_interaction,)), + Edge("inhibits__iu", (iuphar_interaction,)), + Edge("antagonist_of__iu", (iuphar_interaction,)), + Edge("channel_blocker_of__iu", (iuphar_interaction,)), + Edge("allosteric_modulator_of__iu", (iuphar_interaction,)), + Edge("activates__iu", (iuphar_interaction,)), + Edge("antibody_against__iu", (iuphar_interaction,)), + Edge("inhibits_gating__iu", (iuphar_interaction,)), ) ############################################################################## @@ -609,28 +709,34 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd reactome_nodes: Tuple[Node, ...] 
= ( deepcopy(protein).add_properties( - [OProperty('reactome_pathways', ODataType.EMBEDDEDSET, linked_type=ODataType.STRING)]), + [OProperty("reactome_pathways", ODataType.EMBEDDEDSET, linked_type=ODataType.STRING)] + ), ) ############################################################################## # Definition of Reactome generics and indices ############################################################################## -has_ppi_bg = Edge('has_ppi_bg', (has_ppi,), abstract=True, props=( - OProperty("modification", ODataType.STRING), - OProperty("pmids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), - OProperty("dois", ODataType.EMBEDDEDSET, linked_type=ODataType.STRING), - OProperty("biogrid_ids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), -)) +has_ppi_bg = Edge( + "has_ppi_bg", + (has_ppi,), + abstract=True, + props=( + OProperty("modification", ODataType.STRING), + OProperty("pmids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), + OProperty("dois", ODataType.EMBEDDEDSET, linked_type=ODataType.STRING), + OProperty("biogrid_ids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), + ), +) biogrid_edges: Tuple[Edge, ...] 
= ( has_ppi_bg, - Edge('decreases_bg', (has_ppi_bg,)), + Edge("decreases_bg", (has_ppi_bg,)), ) biogrid_edges_auto_generated = [] for pmod in normalized_pmod.keys(): - for effect in ['increases', 'decreases']: + for effect in ["increases", "decreases"]: edge_name = f"{effect}_{pmod}_bg" biogrid_edges_auto_generated.append(Edge(edge_name, (has_ppi_bg,))) @@ -640,106 +746,132 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd # Definition of StringDB generics and indices ############################################################################## -has_action = Edge('has_action', (ebel_relation,), abstract=True, own_class=False, in_out=(protein, protein)) -has_action_st = Edge('has_action_st', (has_action,), abstract=True) -has_ppi_st = Edge('has_ppi_st', (has_ppi,), abstract=True, props=( - OProperty("score", ODataType.INTEGER), -)) -controls_expression_of_st = Edge('controls_expression_of_st', (has_action_st,), abstract=True) -increases_expression_of = Edge('increases_expression_of', abstract=True, own_class=False) -decreases_expression_of = Edge('decreases_expression_of', abstract=True, own_class=False) +has_action = Edge( + "has_action", + (ebel_relation,), + abstract=True, + own_class=False, + in_out=(protein, protein), +) +has_action_st = Edge("has_action_st", (has_action,), abstract=True) +has_ppi_st = Edge( + "has_ppi_st", + (has_ppi,), + abstract=True, + props=(OProperty("score", ODataType.INTEGER),), +) +controls_expression_of_st = Edge("controls_expression_of_st", (has_action_st,), abstract=True) +increases_expression_of = Edge("increases_expression_of", abstract=True, own_class=False) +decreases_expression_of = Edge("decreases_expression_of", abstract=True, own_class=False) stringdb_edges: Tuple[Edge, ...] 
= ( has_ppi, - Edge('has_ppi_st', (has_ppi,), False, props=( - OProperty("neighborhood", ODataType.INTEGER), - OProperty("neighborhood_transferred", ODataType.INTEGER), - OProperty("fusion", ODataType.INTEGER), - OProperty("cooccurence", ODataType.INTEGER), - OProperty("homology", ODataType.INTEGER), - OProperty("coexpression", ODataType.INTEGER), - OProperty("coexpression_transferred", ODataType.INTEGER), - OProperty("experiments", ODataType.INTEGER), - OProperty("experiments_transferred", ODataType.INTEGER), - OProperty("database", ODataType.INTEGER), - OProperty("database_transferred", ODataType.INTEGER), - OProperty("textmining", ODataType.INTEGER), - OProperty("textmining_transferred", ODataType.INTEGER), - OProperty("combined_score", ODataType.INTEGER), - )), - + Edge( + "has_ppi_st", + (has_ppi,), + False, + props=( + OProperty("neighborhood", ODataType.INTEGER), + OProperty("neighborhood_transferred", ODataType.INTEGER), + OProperty("fusion", ODataType.INTEGER), + OProperty("cooccurence", ODataType.INTEGER), + OProperty("homology", ODataType.INTEGER), + OProperty("coexpression", ODataType.INTEGER), + OProperty("coexpression_transferred", ODataType.INTEGER), + OProperty("experiments", ODataType.INTEGER), + OProperty("experiments_transferred", ODataType.INTEGER), + OProperty("database", ODataType.INTEGER), + OProperty("database_transferred", ODataType.INTEGER), + OProperty("textmining", ODataType.INTEGER), + OProperty("textmining_transferred", ODataType.INTEGER), + OProperty("combined_score", ODataType.INTEGER), + ), + ), has_action, has_action_st, has_ppi_st, - Edge('activates_st', (has_action_st,)), - Edge('inhibits_st', (has_action_st,)), + Edge("activates_st", (has_action_st,)), + Edge("inhibits_st", (has_action_st,)), controls_expression_of_st, increases_expression_of, decreases_expression_of, - Edge('increases_expression_of_st', (has_action_st, controls_expression_of_st, increases_expression_of)), - Edge('decreases_expression_of_st', (has_action_st, 
controls_expression_of_st, decreases_expression_of)), - Edge('controls_pmod_of_st', (has_action_st,)), + Edge( + "increases_expression_of_st", + (has_action_st, controls_expression_of_st, increases_expression_of), + ), + Edge( + "decreases_expression_of_st", + (has_action_st, controls_expression_of_st, decreases_expression_of), + ), + Edge("controls_pmod_of_st", (has_action_st,)), ) ############################################################################## # Definition of ClinVar generics and indices ############################################################################## -clinvar_nodes: Tuple[Node, ...] = ( - snp, +clinvar_nodes: Tuple[Node, ...] = (snp,) + +has_snp_clinvar = Edge( + "has_snp_clinvar", + (has_snp,), + abstract=True, + props=( + OProperty("keyword", ODataType.STRING), + OProperty("clinical_significance", ODataType.STRING), + OProperty("phenotype", ODataType.STRING), + ), ) -has_snp_clinvar = Edge('has_snp_clinvar', (has_snp,), abstract=True, props=( - OProperty('keyword', ODataType.STRING), - OProperty('clinical_significance', ODataType.STRING), - OProperty('phenotype', ODataType.STRING), -)) - clinvar_edges: Tuple[Edge, ...] 
= ( has_snp, has_snp_clinvar, has_mapped_snp, has_downstream_snp, has_upstream_snp, - Edge('has_mapped_snp_cv', (has_mapped_snp, has_snp_clinvar)), - Edge('has_downstream_snp_cv', (has_downstream_snp, has_snp_clinvar)), - Edge('has_upstream_snp_cv', (has_upstream_snp, has_snp_clinvar)), + Edge("has_mapped_snp_cv", (has_mapped_snp, has_snp_clinvar)), + Edge("has_downstream_snp_cv", (has_downstream_snp, has_snp_clinvar)), + Edge("has_upstream_snp_cv", (has_upstream_snp, has_snp_clinvar)), ) -clinvar_indices = ( - OIndex(snp, ('rs_number',), OIndexType.UNIQUE_HASH_INDEX), -) +clinvar_indices = (OIndex(snp, ("rs_number",), OIndexType.UNIQUE_HASH_INDEX),) ############################################################################## # Definition of ClinVar generics and indices # http://stitch.embl.de ############################################################################## -stitch = Generic('stitch', props=( - OProperty('pubchem_id', ODataType.INTEGER), - OProperty('ensembl_protein_id', ODataType.INTEGER), - OProperty('experimental_direct', ODataType.INTEGER), - OProperty('experimental_transferred', ODataType.INTEGER), - OProperty('prediction_direct', ODataType.INTEGER), - OProperty('prediction_transferred', ODataType.INTEGER), - OProperty('database_direct', ODataType.INTEGER), - OProperty('database_transferred', ODataType.INTEGER), - OProperty('textmining_direct', ODataType.INTEGER), - OProperty('textmining_transferred', ODataType.INTEGER), - OProperty('combined_score', ODataType.INTEGER), - OProperty('type', ODataType.STRING), - OProperty('hgnc', ODataType.LINK, linked_class=hgnc.name), - OProperty('pubchem', ODataType.LINK, 'stitch_pubchem'), -)) +stitch = Generic( + "stitch", + props=( + OProperty("pubchem_id", ODataType.INTEGER), + OProperty("ensembl_protein_id", ODataType.INTEGER), + OProperty("experimental_direct", ODataType.INTEGER), + OProperty("experimental_transferred", ODataType.INTEGER), + OProperty("prediction_direct", ODataType.INTEGER), + 
OProperty("prediction_transferred", ODataType.INTEGER), + OProperty("database_direct", ODataType.INTEGER), + OProperty("database_transferred", ODataType.INTEGER), + OProperty("textmining_direct", ODataType.INTEGER), + OProperty("textmining_transferred", ODataType.INTEGER), + OProperty("combined_score", ODataType.INTEGER), + OProperty("type", ODataType.STRING), + OProperty("hgnc", ODataType.LINK, linked_class=hgnc.name), + OProperty("pubchem", ODataType.LINK, "stitch_pubchem"), + ), +) stitch_generics: Tuple[Generic, ...] = ( - Generic('stitch_pubchem', props=( - OProperty('CID', ODataType.INTEGER), - OProperty('IUPACName', ODataType.STRING), - OProperty('MolecularFormula', ODataType.STRING), - OProperty('IsomericSMILES', ODataType.STRING), - OProperty('InChI', ODataType.STRING), - OProperty('InChIKey', ODataType.STRING), - )), + Generic( + "stitch_pubchem", + props=( + OProperty("CID", ODataType.INTEGER), + OProperty("IUPACName", ODataType.STRING), + OProperty("MolecularFormula", ODataType.STRING), + OProperty("IsomericSMILES", ODataType.STRING), + OProperty("InChI", ODataType.STRING), + OProperty("InChIKey", ODataType.STRING), + ), + ), stitch, ) @@ -760,34 +892,40 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd # Definition of DisGeNet ############################################################################## -disgenet_nodes: Tuple[Node, ...] = ( - snp, +disgenet_nodes: Tuple[Node, ...] 
= (snp,) + +gene_disease_association = Edge("gene_disease_association", (ebel_relation,), abstract=True) +has_snp_disgenet = Edge( + "has_snp_disgenet", + (has_snp,), + abstract=True, + props=( + OProperty("disease_name", ODataType.STRING), + OProperty("pmid", ODataType.INTEGER), + OProperty("score", ODataType.FLOAT), + OProperty("source", ODataType.STRING), + ), ) -gene_disease_association = Edge('gene_disease_association', (ebel_relation,), abstract=True) -has_snp_disgenet = Edge('has_snp_disgenet', (has_snp,), abstract=True, props=( - OProperty('disease_name', ODataType.STRING), - OProperty('pmid', ODataType.INTEGER), - OProperty('score', ODataType.FLOAT), - OProperty('source', ODataType.STRING), -)) - disgenet_edges: Tuple[Edge, ...] = ( gene_disease_association, - Edge('disgenet_gene_disease', (gene_disease_association,), props=( - OProperty('pmid', ODataType.INTEGER), - OProperty('score', ODataType.FLOAT), - OProperty('source', ODataType.STRING), - )), - + Edge( + "disgenet_gene_disease", + (gene_disease_association,), + props=( + OProperty("pmid", ODataType.INTEGER), + OProperty("score", ODataType.FLOAT), + OProperty("source", ODataType.STRING), + ), + ), has_snp, has_snp_disgenet, has_mapped_snp, has_downstream_snp, has_upstream_snp, - Edge('has_mapped_snp_dgn', (has_mapped_snp, has_snp_disgenet)), - Edge('has_downstream_snp_dgn', (has_downstream_snp, has_snp_disgenet)), - Edge('has_upstream_snp_dgn', (has_upstream_snp, has_snp_disgenet)), + Edge("has_mapped_snp_dgn", (has_mapped_snp, has_snp_disgenet)), + Edge("has_downstream_snp_dgn", (has_downstream_snp, has_snp_disgenet)), + Edge("has_upstream_snp_dgn", (has_upstream_snp, has_snp_disgenet)), ) ############################################################################## @@ -795,103 +933,124 @@ def __init__(self, odb_class: OClass, columns: Tuple[str, ...], index_type: OInd ############################################################################## pathway_commons_generics: Tuple[Generic, ...] 
= ( - Generic('pc_pathway_name', props=( - OProperty('name', ODataType.STRING), - )), + Generic("pc_pathway_name", props=(OProperty("name", ODataType.STRING),)), +) +has_action_pc = Edge( + "has_action_pc", + (has_action,), + abstract=True, + props=( + OProperty("pathways", ODataType.LINKSET, "pc_pathway_name"), + OProperty("sources", ODataType.EMBEDDEDSET, linked_type=ODataType.STRING), + OProperty("pmids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), + OProperty("type", ODataType.STRING), + ), ) -has_action_pc = Edge('has_action_pc', (has_action,), abstract=True, props=( - OProperty("pathways", ODataType.LINKSET, 'pc_pathway_name'), - OProperty("sources", ODataType.EMBEDDEDSET, linked_type=ODataType.STRING), - OProperty("pmids", ODataType.EMBEDDEDSET, linked_type=ODataType.INTEGER), - OProperty("type", ODataType.STRING), -)) pathway_commons_edges: Tuple[Edge, ...] = ( has_action, has_action_pc, - Edge('controls_expression_of_pc', (has_action_pc,)), - Edge('controls_phosphorylation_of_pc', (has_action_pc,)), - Edge('controls_transport_of_pc', (has_action_pc,)), + Edge("controls_expression_of_pc", (has_action_pc,)), + Edge("controls_phosphorylation_of_pc", (has_action_pc,)), + Edge("controls_transport_of_pc", (has_action_pc,)), ) ############################################################################## # Definition of KEGG generics, edges and indices ############################################################################## -pathway_interaction = Edge('pathway_interaction', (ebel_relation,), abstract=True, own_class=False) -has_ppi_kg = Edge('has_ppi_kg', (pathway_interaction,), abstract=True, props=( - OProperty("interaction_type", ODataType.STRING), - OProperty('pathway_names', ODataType.EMBEDDEDSET), -)) -increases_expression_of = Edge('increases_expression_of', abstract=True, own_class=False) -decreases_expression_of = Edge('decreases_expression_of', abstract=True, own_class=False) +pathway_interaction = Edge("pathway_interaction", 
(ebel_relation,), abstract=True, own_class=False) +has_ppi_kg = Edge( + "has_ppi_kg", + (pathway_interaction,), + abstract=True, + props=( + OProperty("interaction_type", ODataType.STRING), + OProperty("pathway_names", ODataType.EMBEDDEDSET), + ), +) +increases_expression_of = Edge("increases_expression_of", abstract=True, own_class=False) +decreases_expression_of = Edge("decreases_expression_of", abstract=True, own_class=False) kegg_edges: Tuple[Edge, ...] = ( pathway_interaction, has_ppi_kg, increases_expression_of, decreases_expression_of, - Edge('decreases_pho_kg', (has_ppi_kg,)), - Edge('increases_gly_kg', (has_ppi_kg,)), - Edge('increases_me0_kg', (has_ppi_kg,)), - Edge('increases_pho_kg', (has_ppi_kg,)), - Edge('increases_ubi_kg', (has_ppi_kg,)), + Edge("decreases_pho_kg", (has_ppi_kg,)), + Edge("increases_gly_kg", (has_ppi_kg,)), + Edge("increases_me0_kg", (has_ppi_kg,)), + Edge("increases_pho_kg", (has_ppi_kg,)), + Edge("increases_ubi_kg", (has_ppi_kg,)), ) ############################################################################## # NSides definitions ############################################################################## -side_effect = Node('side_effect', (ebel,), props=( - OProperty('label', ODataType.STRING, node_view_label=True), - OProperty('condition_meddra_id', ODataType.STRING, node_view_sub_label=True), -)) - -nsides_nodes: Tuple[Node, ...] = ( - side_effect, +side_effect = Node( + "side_effect", + (ebel,), + props=( + OProperty("label", ODataType.STRING, node_view_label=True), + OProperty("condition_meddra_id", ODataType.STRING, node_view_sub_label=True), + ), ) +nsides_nodes: Tuple[Node, ...] 
= (side_effect,) + nsides_edges: Tuple[Edge] = ( - Edge('has_side_effect', (ebel_relation,), props=( - OProperty('prr', ODataType.FLOAT), - OProperty('mean_reporting_frequency', ODataType.FLOAT), - ), in_out=(side_effect, drug_db)), + Edge( + "has_side_effect", + (ebel_relation,), + props=( + OProperty("prr", ODataType.FLOAT), + OProperty("mean_reporting_frequency", ODataType.FLOAT), + ), + in_out=(side_effect, drug_db), + ), ) ############################################################################## # All OrientDB classes as Dict ############################################################################## -nodes: Tuple[Node, ...] = (bel_nodes - + hgnc_nodes - + gwascatalog_nodes - + drugbank_nodes - + reactome_nodes - + clinvar_nodes - + disgenet_nodes - + nsides_nodes) - -edges: Tuple[Edge, ...] = (bel_edges - + intact_edges - + mirtarbase_edges - + gwascatalog_edges - + drugbank_edges - + iuphar_edges - + biogrid_edges - + biogrid_edges - + stringdb_edges - + clinvar_edges - + disgenet_edges - + kegg_edges - + nsides_edges) +nodes: Tuple[Node, ...] = ( + bel_nodes + + hgnc_nodes + + gwascatalog_nodes + + drugbank_nodes + + reactome_nodes + + clinvar_nodes + + disgenet_nodes + + nsides_nodes +) + +edges: Tuple[Edge, ...] = ( + bel_edges + + intact_edges + + mirtarbase_edges + + gwascatalog_edges + + drugbank_edges + + iuphar_edges + + biogrid_edges + + biogrid_edges + + stringdb_edges + + clinvar_edges + + disgenet_edges + + kegg_edges + + nsides_edges +) nodes_and_edges: Tuple[OClass, ...] 
= nodes + edges nodes_and_edges_dict: Dict[str, OClass] = {o_class.name: o_class for o_class in nodes_and_edges} -non_abstract_nodes_and_edges_dict: Dict[str, OClass] = {o_class.name: o_class for o_class in nodes_and_edges if - o_class.abstract is False} -abstract_nodes_and_edges_dict: Dict[str, OClass] = {o_class.name: o_class for o_class in nodes_and_edges if - o_class.abstract is True} +non_abstract_nodes_and_edges_dict: Dict[str, OClass] = { + o_class.name: o_class for o_class in nodes_and_edges if o_class.abstract is False +} +abstract_nodes_and_edges_dict: Dict[str, OClass] = { + o_class.name: o_class for o_class in nodes_and_edges if o_class.abstract is True +} def get_columns(class_name, columns: Tuple[str] = (), exclude_non_serializable: bool = True) -> list: @@ -912,7 +1071,12 @@ def get_columns(class_name, columns: Tuple[str] = (), exclude_non_serializable: List of columns. """ o_class: OClass = nodes_and_edges_dict[class_name] - exclude_classes = [ODataType.LINK, ODataType.LINKSET, ODataType.LINKMAP, ODataType.LINKBAG] + exclude_classes = [ + ODataType.LINK, + ODataType.LINKSET, + ODataType.LINKMAP, + ODataType.LINKBAG, + ] columns = list(columns) @@ -921,7 +1085,7 @@ def get_columns(class_name, columns: Tuple[str] = (), exclude_non_serializable: else: columns += [p.prop_name for p in o_class.props] for extends_name in o_class.extends: - if extends_name not in ['V', 'E']: + if extends_name not in ["V", "E"]: columns += get_columns(extends_name, tuple(columns), exclude_non_serializable) return columns @@ -942,15 +1106,15 @@ def get_node_view_labels(name: str, labels: dict = None) -> dict: {"label": list_of_labels, "sub_label": list_of_sub_labels} """ if not labels: - labels = {'label': [], 'sub_label': []} + labels = {"label": [], "sub_label": []} o_class = nodes_and_edges_dict[name] for p in o_class.props: if p.node_view_label: - labels['label'].append(p.prop_name) + labels["label"].append(p.prop_name) if p.node_view_sub_label: - 
labels['sub_label'].append(p.prop_name) - if not labels['label']: + labels["sub_label"].append(p.prop_name) + if not labels["label"]: for extends_name in o_class.extends: - if extends_name not in ['V', 'E']: + if extends_name not in ["V", "E"]: get_node_view_labels(extends_name, labels) return labels diff --git a/ebel/manager/orientdb/urls.py b/ebel/manager/orientdb/urls.py index 594cdb2..cbf84f5 100755 --- a/ebel/manager/orientdb/urls.py +++ b/ebel/manager/orientdb/urls.py @@ -3,14 +3,17 @@ # HGNC # HGNC_JSON = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/hgnc_complete_set.json" HGNC_TSV = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt" -HCOP_GZIP = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_all_hcop_sixteen_column.txt.gz' +HCOP_GZIP = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_all_hcop_sixteen_column.txt.gz" # UniProt # -UNIPROT_SPROT = "ftp://ftp.uniprot.org/pub/databases/uniprot/" \ - "current_release/knowledgebase/complete/uniprot_sprot.xml.gz" -UNIPROT_HGNC = "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=md_prot_id&" \ - "status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&" \ - "format=text&submit=submit" +UNIPROT_SPROT = ( + "ftp://ftp.uniprot.org/pub/databases/uniprot/" "current_release/knowledgebase/complete/uniprot_sprot.xml.gz" +) +UNIPROT_HGNC = ( + "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=md_prot_id&" + "status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&" + "format=text&submit=submit" +) UNIPROT_MGI = "http://www.informatics.jax.org/downloads/reports/MRK_SwissProt.rpt" UNIPROT_RGD = "https://download.rgd.mcw.edu/data_release/GENES_RAT.txt" UNIPROT_FLYBASE = "ftp://ftp.flybase.org/releases/current/precomputed_files/genes/fbgn_NAseq_Uniprot_fb_2020_04.tsv.gz" @@ -26,19 +29,20 @@ # PPIs # BIND = 
"https://www.bindingdb.org/bind/downloads/BindingDB_All_2018m7.tsv.zip" -BIOGRID = \ +BIOGRID = ( "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.215/BIOGRID-ALL-4.4.215.tab3.zip" +) INTACT = "ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip" STITCH = "http://stitch.embl.de/download/protein_chemical.links.transfer.v5.0.tsv.gz" # String # -STRING_INTS = "https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.5.txt.gz" -STRING_ACTIONS = "https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.5.txt.gz" -STRING_NAMES = "https://stringdb-static.org/download/protein.info.v11.0/9606.protein.info.v11.5.txt.gz" +STRING_INTS = "https://stringdb-static.org/download/protein.links.full.v11.5/9606.protein.links.full.v11.5.txt.gz" +STRING_ACTIONS = "https://stringdb-static.org/download/protein.actions.v11.0/9606.protein.actions.v11.0.txt.gz" +STRING_NAMES = "https://stringdb-static.org/download/protein.info.v11.5/9606.protein.info.v11.5.txt.gz" # Pathway DBs # KEGG_PATH_LIST = "http://rest.kegg.jp/list/pathway/hsa" -PATHWAY_COMMONS = "https://www.pathwaycommons.org/archives/PC2/v13/PathwayCommons13.Detailed.hgnc.txt.gz" +PATHWAY_COMMONS = "https://www.pathwaycommons.org/archives/PC2/v12/PathwayCommons12.Detailed.hgnc.txt.gz" REACTOME = "https://reactome.org/download/current/UniProt2Reactome.txt" # TODO: Import from Reactome MySQL # REACTOME MySQL has a strange database structure and no controlled vocabulary, reactions are not classified @@ -66,7 +70,7 @@ # Drugs and Side Effects # OFFSIDES = "http://tatonettilab.org/resources/nsides/OFFSIDES.csv.xz" -ONSIDES = "https://github.com/tatonetti-lab/onsides/releases/download/v01/onsides_v01_20220430.tar.gz" +ONSIDES = "https://github.com/tatonetti-lab/onsides/releases/download/v2.0.0-20230629/onsides_v2.0.0_20230629.tar.gz" # DrugBank DRUGBANK_VERSION = "https://go.drugbank.ca/releases/latest#full" DRUGBANK_DATA = 
"https://go.drugbank.com/releases/{}/downloads/all-full-database" @@ -88,8 +92,9 @@ # CHEBI_COMPOUNDORIGIN = f"{CHEBI_BASE}compound_origins.tsv" # NCBI # -NCBI_PMID = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=ebel&" \ - "ids={}&idtype=pmid&versions=no&format=json" +NCBI_PMID = ( + "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=ebel&" "ids={}&idtype=pmid&versions=no&format=json" +) NCBI_MESH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=" # NCBI Gene NCBI_GENE_INFO = "https://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz" diff --git a/ebel/manager/rdbms/__init__.py b/ebel/manager/rdbms/__init__.py index 27d663a..ecdf692 100644 --- a/ebel/manager/rdbms/__init__.py +++ b/ebel/manager/rdbms/__init__.py @@ -1,3 +1,2 @@ """RDBMS init.""" -from ebel.manager.rdbms import tools -from ebel.manager.rdbms import models +from ebel.manager.rdbms import models, tools diff --git a/ebel/manager/rdbms/models/__init__.py b/ebel/manager/rdbms/models/__init__.py index 48fbeaf..e398b18 100644 --- a/ebel/manager/rdbms/models/__init__.py +++ b/ebel/manager/rdbms/models/__init__.py @@ -4,5 +4,4 @@ def object_as_dict(obj, exclude: list = []) -> dict: """Return object values as a dictionary.""" - return {c.key: getattr(obj, c.key) - for c in inspect(obj).mapper.column_attrs if c.key not in exclude} + return {c.key: getattr(obj, c.key) for c in inspect(obj).mapper.column_attrs if c.key not in exclude} diff --git a/ebel/manager/rdbms/models/biogrid.py b/ebel/manager/rdbms/models/biogrid.py index 595b5bf..c3af157 100644 --- a/ebel/manager/rdbms/models/biogrid.py +++ b/ebel/manager/rdbms/models/biogrid.py @@ -1,59 +1,58 @@ """BioGRID RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Float, Text, ForeignKey +from sqlalchemy.orm import 
relationship from ebel.manager.rdbms.models import object_as_dict - Base = declarative_base() class Biogrid(Base): """Class definition for the biogrid table.""" - __tablename__ = 'biogrid' + __tablename__ = "biogrid" id = Column(Integer, primary_key=True) - biogrid_a_id = Column(Integer, ForeignKey('biogrid_interactor.biogrid_id')) + biogrid_a_id = Column(Integer, ForeignKey("biogrid_interactor.biogrid_id")) biogrid_a = relationship("Interactor", foreign_keys=[biogrid_a_id]) - biogrid_b_id = Column(Integer, ForeignKey('biogrid_interactor.biogrid_id')) + biogrid_b_id = Column(Integer, ForeignKey("biogrid_interactor.biogrid_id")) biogrid_b = relationship("Interactor", foreign_keys=[biogrid_b_id]) biogrid_id = Column(Integer, nullable=True) - experimental_system_id = Column(Integer, ForeignKey('biogrid_experimental_system.id')) + experimental_system_id = Column(Integer, ForeignKey("biogrid_experimental_system.id")) experimental_system = relationship("ExperimentalSystem", foreign_keys=[experimental_system_id]) - throughput_id = Column(Integer, ForeignKey('biogrid_throughput.id')) + throughput_id = Column(Integer, ForeignKey("biogrid_throughput.id")) throughput = relationship("Throughput", foreign_keys=[throughput_id]) score = Column(Float, nullable=True) - modification_id = Column(Integer, ForeignKey('biogrid_modification.id')) + modification_id = Column(Integer, ForeignKey("biogrid_modification.id")) modification = relationship("Modification", foreign_keys=[modification_id]) qualifications = Column(String(255), nullable=True) - source_id = Column(Integer, ForeignKey('biogrid_source.id')) + source_id = Column(Integer, ForeignKey("biogrid_source.id")) source = relationship("Source", foreign_keys=[source_id]) - publication_id = Column(Integer, ForeignKey('biogrid_publication.id')) + publication_id = Column(Integer, ForeignKey("biogrid_publication.id")) publication = relationship("Publication", foreign_keys=[publication_id]) qualification = Column(Text, nullable=True) def 
as_dict(self): """Convert object values to dictionary.""" return { - 'biogrid_a': self.biogrid_a.as_dict(), - 'biogrid_b': self.biogrid_b.as_dict(), - 'experimental_system': self.experimental_system.as_dict() if self.experimental_system else None, - 'throughput': self.throughput.as_dict() if self.throughput else None, - 'biogrid_id': self.biogrid_id, - 'score': self.score if self.score else None, - 'modification': self.modification.as_dict() if self.modification else None, - 'source': self.source.source, - 'publication': self.publication.as_dict(), - 'qualification': self.qualification if self.qualification else None + "biogrid_a": self.biogrid_a.as_dict(), + "biogrid_b": self.biogrid_b.as_dict(), + "experimental_system": self.experimental_system.as_dict() if self.experimental_system else None, + "throughput": self.throughput.as_dict() if self.throughput else None, + "biogrid_id": self.biogrid_id, + "score": self.score if self.score else None, + "modification": self.modification.as_dict() if self.modification else None, + "source": self.source.source, + "publication": self.publication.as_dict(), + "qualification": self.qualification if self.qualification else None, } class Publication(Base): """Class definition for the biogrid_publication table.""" - __tablename__ = 'biogrid_publication' + __tablename__ = "biogrid_publication" id = Column(Integer, primary_key=True) author_name = Column(String(255), nullable=True) publication_year = Column(Integer, nullable=True) @@ -62,41 +61,38 @@ class Publication(Base): def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) class Throughput(Base): """Class definition for the biogrid_throughput table.""" - __tablename__ = 'biogrid_throughput' + __tablename__ = "biogrid_throughput" id = Column(Integer, primary_key=True) throughput = Column(String(255)) frequency = Column(Integer) def as_dict(self): """Convert object values to 
dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) class Taxonomy(Base): """Class definition for the biogrid_taxonomy table.""" - __tablename__ = 'biogrid_taxonomy' + __tablename__ = "biogrid_taxonomy" taxonomy_id = Column(Integer, primary_key=True) # == NCBI Taxonomy ID organism_name = Column(String(1000)) def as_dict(self): """Convert object values to dictionary.""" - return { - 'taxonomy_id': self.taxonomy_id, - 'organism_name': self.organism_name - } + return {"taxonomy_id": self.taxonomy_id, "organism_name": self.organism_name} class ExperimentalSystem(Base): """Class definition for the biogrid_experimental_system table.""" - __tablename__ = 'biogrid_experimental_system' + __tablename__ = "biogrid_experimental_system" id = Column(Integer, primary_key=True) experimental_system = Column(String(255), nullable=True) experimental_system_type = Column(String(255), nullable=True) @@ -104,19 +100,19 @@ class ExperimentalSystem(Base): def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) class Interactor(Base): """Class definition for the biogrid_interactor table.""" - __tablename__ = 'biogrid_interactor' + __tablename__ = "biogrid_interactor" biogrid_id = Column(Integer, primary_key=True) entrez = Column(Integer, nullable=True, index=True) systematic_name = Column(String(255), nullable=True, index=True) symbol = Column(String(255), nullable=True, index=True) - taxonomy_id = Column(Integer, ForeignKey('biogrid_taxonomy.taxonomy_id')) + taxonomy_id = Column(Integer, ForeignKey("biogrid_taxonomy.taxonomy_id")) taxonomy = relationship("Taxonomy", foreign_keys=[taxonomy_id]) uniprot = Column(String(255), nullable=True, index=True) trembl = Column(String(1000), nullable=True) @@ -124,35 +120,35 @@ class Interactor(Base): def as_dict(self): """Convert object values to dictionary.""" return { - 'entrez': self.entrez, - 
'systematic_name': self.systematic_name, - 'symbol': self.symbol, - 'uniprot': self.uniprot, - 'trembl': self.trembl, - 'taxonomy': self.taxonomy.as_dict() + "entrez": self.entrez, + "systematic_name": self.systematic_name, + "symbol": self.symbol, + "uniprot": self.uniprot, + "trembl": self.trembl, + "taxonomy": self.taxonomy.as_dict(), } class Source(Base): """Class definition for the biogrid_source table.""" - __tablename__ = 'biogrid_source' + __tablename__ = "biogrid_source" id = Column(Integer, primary_key=True) source = Column(String(255), nullable=True) def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) class Modification(Base): """Class definition for the biogrid_modification table.""" - __tablename__ = 'biogrid_modification' + __tablename__ = "biogrid_modification" id = Column(Integer, primary_key=True) modification = Column(String(255), nullable=True) frequency = Column(Integer) def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) diff --git a/ebel/manager/rdbms/models/chebi.py b/ebel/manager/rdbms/models/chebi.py index c0e0559..28ea3ce 100644 --- a/ebel/manager/rdbms/models/chebi.py +++ b/ebel/manager/rdbms/models/chebi.py @@ -1,8 +1,9 @@ """CHEBI RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import (Column, DateTime, ForeignKey, Index, Integer, String, + Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Text, ForeignKey, DateTime, Index +from sqlalchemy.orm import relationship Base = declarative_base() @@ -10,14 +11,14 @@ class ChemicalData(Base): """Class definition for the chebi_chemical_data table.""" - __tablename__ = 'chebi_chemical_data' + __tablename__ = "chebi_chemical_data" id = Column(Integer, primary_key=True) chemical_data = Column(Text, 
nullable=True) source = Column(Text, nullable=False) type = Column(Text, nullable=False) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="chemicalData") def __str__(self): @@ -27,16 +28,16 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'chemical_data': self.chemical_data, - 'source': self.source, - 'type': self.type, + "chemical_data": self.chemical_data, + "source": self.source, + "type": self.type, } class Comment(Base): """Class definition for the chebi_comment table.""" - __tablename__ = 'chebi_comment' + __tablename__ = "chebi_comment" id = Column(Integer, primary_key=True) text = Column(Text, nullable=False) @@ -44,7 +45,7 @@ class Comment(Base): datatype = Column(String(80)) datatype_id = Column(Integer, nullable=False) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="comments") def __str__(self): @@ -54,15 +55,15 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'text': self.text, - 'datatype': self.datatype, + "text": self.text, + "datatype": self.datatype, } class Compound(Base): """Class definition for the chebi_compound table.""" - __tablename__ = 'chebi_compound' + __tablename__ = "chebi_compound" id = Column(Integer, primary_key=True) name = Column(String(2000)) @@ -91,30 +92,30 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'name': self.name, - 'source': self.source, - 'parent_id': self.parent_id, - 'chebi_accession': self.chebi_accession, - 'status': self.status, - 'definition': self.definition, - 'chemicalData': [x.as_dict() for x in self.chemicalData], - 'comments': [x.as_dict() for x in self.comments], - 'database_accessions': [x.as_dict() for x 
in self.database_accessions], - 'names': [x.as_dict() for x in self.names], - 'references': [x.as_dict() for x in self.references], - 'inchis': [x.as_dict() for x in self.inchis] + "name": self.name, + "source": self.source, + "parent_id": self.parent_id, + "chebi_accession": self.chebi_accession, + "status": self.status, + "definition": self.definition, + "chemicalData": [x.as_dict() for x in self.chemicalData], + "comments": [x.as_dict() for x in self.comments], + "database_accessions": [x.as_dict() for x in self.database_accessions], + "names": [x.as_dict() for x in self.names], + "references": [x.as_dict() for x in self.references], + "inchis": [x.as_dict() for x in self.inchis], } class Inchi(Base): """Class definition for the chebi_inchi table.""" - __tablename__ = 'chebi_inchi' + __tablename__ = "chebi_inchi" id = Column(Integer, primary_key=True) inchi = Column(Text) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="inchis") def __str__(self): @@ -122,22 +123,20 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return { - 'inchi': self.inchi - } + return {"inchi": self.inchi} class DatabaseAccession(Base): """Class definition for the chebi_database_accession table.""" - __tablename__ = 'chebi_database_accession' + __tablename__ = "chebi_database_accession" id = Column(Integer, primary_key=True) accession_number = Column(String(255), nullable=True) type = Column(Text, nullable=False) source = Column(Text, nullable=False) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="database_accessions") def __str__(self): @@ -146,16 +145,16 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'accession_number': 
self.accession_number, - 'type': self.type, - 'source': self.source, + "accession_number": self.accession_number, + "type": self.type, + "source": self.source, } class Name(Base): """Class definition for the chebi_name table.""" - __tablename__ = 'chebi_name' + __tablename__ = "chebi_name" id = Column(Integer, primary_key=True) name = Column(Text, nullable=True) @@ -164,7 +163,7 @@ class Name(Base): adapted = Column(Text, nullable=False) language = Column(Text, nullable=False) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="names") def __str__(self): @@ -173,18 +172,18 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'name': self.name, - 'type': self.type, - 'source': self.source, - 'adapted': self.adapted, - 'language': self.language + "name": self.name, + "type": self.type, + "source": self.source, + "adapted": self.adapted, + "language": self.language, } class Reference(Base): """Class definition for the chebi_reference table.""" - __tablename__ = 'chebi_reference' + __tablename__ = "chebi_reference" id = Column(Integer, primary_key=True) @@ -193,10 +192,10 @@ class Reference(Base): location_in_ref = Column(String(90), index=True) reference_name = Column(String(1024)) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="references") - __table_args__ = (Index('ix_chebi_reference__reference_name', reference_name, mysql_length=500),) + __table_args__ = (Index("ix_chebi_reference__reference_name", reference_name, mysql_length=500),) def __str__(self): return f"{self.reference_db_name}:{self.reference_id}" @@ -204,34 +203,34 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'reference_id': self.reference_id, - 
'reference_db_name': self.reference_db_name, - 'location_in_ref': self.location_in_ref, - 'reference_name': self.reference_name, + "reference_id": self.reference_id, + "reference_db_name": self.reference_db_name, + "location_in_ref": self.location_in_ref, + "reference_name": self.reference_name, } def as_dict_with_compound_id(self): """Convert object values to dictionary with compound ID.""" return { - 'reference_id': self.reference_id, - 'reference_db_name': self.reference_db_name, - 'location_in_ref': self.location_in_ref, - 'reference_name': self.reference_name, - 'compound_id': self.compound_id + "reference_id": self.reference_id, + "reference_db_name": self.reference_db_name, + "location_in_ref": self.location_in_ref, + "reference_name": self.reference_name, + "compound_id": self.compound_id, } class Relation(Base): """Class definition for the chebi_relation table.""" - __tablename__ = 'chebi_relation' + __tablename__ = "chebi_relation" id = Column(Integer, primary_key=True) type = Column(Text, nullable=False) status = Column(String(1), nullable=False) - final_id = Column(Integer, ForeignKey('chebi_compound.id')) - init_id = Column(Integer, ForeignKey('chebi_compound.id')) + final_id = Column(Integer, ForeignKey("chebi_compound.id")) + init_id = Column(Integer, ForeignKey("chebi_compound.id")) final_id_compounds = relationship("Compound", foreign_keys=[final_id]) init_id_compounds = relationship("Compound", foreign_keys=[init_id]) @@ -242,17 +241,17 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'type': self.type, - 'status': self.status, - 'final_id': self.final_id, - 'init_id': self.init_id + "type": self.type, + "status": self.status, + "final_id": self.final_id, + "init_id": self.init_id, } class Structure(Base): """Class definition for the chebi_structure table.""" - __tablename__ = 'chebi_structure' + __tablename__ = "chebi_structure" id = Column(Integer, primary_key=True) structure = Column(Text, 
nullable=False) @@ -261,7 +260,7 @@ class Structure(Base): default_structure = Column(String(1), nullable=False) autogen_structure = Column(String(1), nullable=False) - compound_id = Column(Integer, ForeignKey('chebi_compound.id')) + compound_id = Column(Integer, ForeignKey("chebi_compound.id")) compounds = relationship("Compound", back_populates="structures") def __str__(self): diff --git a/ebel/manager/rdbms/models/clinical_trials_gov.py b/ebel/manager/rdbms/models/clinical_trials_gov.py index d5e086c..f2f02ba 100644 --- a/ebel/manager/rdbms/models/clinical_trials_gov.py +++ b/ebel/manager/rdbms/models/clinical_trials_gov.py @@ -1,51 +1,87 @@ """ClinicalTrials.gov RDBMS model definition.""" import re -from sqlalchemy.orm import relationship +from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Table, Text, ForeignKey +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import object_as_dict - Base = declarative_base() -ctg_keyword_n2m = Table('clinical_trials_gov__keyword', Base.metadata, - Column('clinical_trials_gov_keyword_id', Integer, ForeignKey('clinical_trials_gov_keyword.id'), - index=True), - Column('clinical_trials_gov_id', Integer, - ForeignKey('clinical_trials_gov.id'), index=True) - ) - -ctg_condition_n2m = Table('clinical_trials_gov__condition', Base.metadata, - Column('clinical_trials_gov_condition_id', Integer, - ForeignKey('clinical_trials_gov_condition.id'), - index=True), - Column('clinical_trials_gov_id', Integer, - ForeignKey('clinical_trials_gov.id'), index=True) - ) - -ctg_mesh_term_n2m = Table('clinical_trials_gov__mesh_term', Base.metadata, - Column('clinical_trials_gov_mesh_term_id', Integer, - ForeignKey('clinical_trials_gov_mesh_term.id'), - index=True), - Column('clinical_trials_gov_id', Integer, - ForeignKey('clinical_trials_gov.id'), index=True) - ) - -ctg_intervention_n2m = 
Table('clinical_trials_gov__intervention', Base.metadata, - Column('clinical_trials_gov_intervention_id', Integer, - ForeignKey('clinical_trials_gov_intervention.id'), - index=True), - Column('clinical_trials_gov_id', Integer, - ForeignKey('clinical_trials_gov.id'), index=True) - ) +ctg_keyword_n2m = Table( + "clinical_trials_gov__keyword", + Base.metadata, + Column( + "clinical_trials_gov_keyword_id", + Integer, + ForeignKey("clinical_trials_gov_keyword.id"), + index=True, + ), + Column( + "clinical_trials_gov_id", + Integer, + ForeignKey("clinical_trials_gov.id"), + index=True, + ), +) + +ctg_condition_n2m = Table( + "clinical_trials_gov__condition", + Base.metadata, + Column( + "clinical_trials_gov_condition_id", + Integer, + ForeignKey("clinical_trials_gov_condition.id"), + index=True, + ), + Column( + "clinical_trials_gov_id", + Integer, + ForeignKey("clinical_trials_gov.id"), + index=True, + ), +) + +ctg_mesh_term_n2m = Table( + "clinical_trials_gov__mesh_term", + Base.metadata, + Column( + "clinical_trials_gov_mesh_term_id", + Integer, + ForeignKey("clinical_trials_gov_mesh_term.id"), + index=True, + ), + Column( + "clinical_trials_gov_id", + Integer, + ForeignKey("clinical_trials_gov.id"), + index=True, + ), +) + +ctg_intervention_n2m = Table( + "clinical_trials_gov__intervention", + Base.metadata, + Column( + "clinical_trials_gov_intervention_id", + Integer, + ForeignKey("clinical_trials_gov_intervention.id"), + index=True, + ), + Column( + "clinical_trials_gov_id", + Integer, + ForeignKey("clinical_trials_gov.id"), + index=True, + ), +) class ClinicalTrialGov(Base): """Class definition for the clinical_trials_gov table.""" - __tablename__ = 'clinical_trials_gov' + __tablename__ = "clinical_trials_gov" id = Column(Integer, primary_key=True) nct_id = Column(String(100), index=True) @@ -72,117 +108,111 @@ class ClinicalTrialGov(Base): "Keyword", secondary=ctg_keyword_n2m, back_populates="trials", - cascade="save-update" + cascade="save-update", ) conditions 
= relationship( "Condition", secondary=ctg_condition_n2m, back_populates="trials", - cascade="save-update" + cascade="save-update", ) mesh_terms = relationship( "MeshTerm", secondary=ctg_mesh_term_n2m, back_populates="trials", - cascade="save-update" + cascade="save-update", ) interventions = relationship( "Intervention", secondary=ctg_intervention_n2m, back_populates="trials", - cascade="save-update" + cascade="save-update", ) def as_dict(self): """Convert object values to dictionary.""" basic_dict = object_as_dict(self) - basic_dict['brief_summary'] = re.sub(r'\r?\n\s*', ' ', basic_dict['brief_summary']).strip() - basic_dict.update({'keywords': [x.keyword for x in self.keywords]}) - basic_dict.update({'conditions': [x.condition for x in self.conditions]}) - basic_dict.update({'mesh_terms': [x.mesh_term for x in self.mesh_terms]}) - basic_dict.update({'interventions': [ - {'intervention_type': x.intervention_type, - 'intervention_name': x.intervention_name} for x in self.interventions]}) + basic_dict["brief_summary"] = re.sub(r"\r?\n\s*", " ", basic_dict["brief_summary"]).strip() + basic_dict.update({"keywords": [x.keyword for x in self.keywords]}) + basic_dict.update({"conditions": [x.condition for x in self.conditions]}) + basic_dict.update({"mesh_terms": [x.mesh_term for x in self.mesh_terms]}) + basic_dict.update( + { + "interventions": [ + { + "intervention_type": x.intervention_type, + "intervention_name": x.intervention_name, + } + for x in self.interventions + ] + } + ) return basic_dict class Keyword(Base): """Class definition for the clinical_trials_gov_keyword table.""" - __tablename__ = 'clinical_trials_gov_keyword' + __tablename__ = "clinical_trials_gov_keyword" id = Column(Integer, primary_key=True) keyword = Column(String(255), index=True) - trials = relationship( - "ClinicalTrialGov", - secondary=ctg_keyword_n2m, - back_populates="keywords") + trials = relationship("ClinicalTrialGov", secondary=ctg_keyword_n2m, back_populates="keywords") def 
as_dict(self): """Convert object values to dictionary.""" - return { - 'keyword': self.keyword, - 'nct_ids': [x.nct_id for x in self.trials] - } + return {"keyword": self.keyword, "nct_ids": [x.nct_id for x in self.trials]} class Condition(Base): """Class definition for the clinical_trials_gov_condition table.""" - __tablename__ = 'clinical_trials_gov_condition' + __tablename__ = "clinical_trials_gov_condition" id = Column(Integer, primary_key=True) condition = Column(Text) - trials = relationship( - "ClinicalTrialGov", - secondary=ctg_condition_n2m, - back_populates="conditions") + trials = relationship("ClinicalTrialGov", secondary=ctg_condition_n2m, back_populates="conditions") def as_dict(self): """Convert object values to dictionary.""" - return { - 'condition': self.condition, - 'nct_ids': [x.nct_id for x in self.trials] - } + return {"condition": self.condition, "nct_ids": [x.nct_id for x in self.trials]} class MeshTerm(Base): """Class definition for the clinical_trials_gov_mesh_term table.""" - __tablename__ = 'clinical_trials_gov_mesh_term' + __tablename__ = "clinical_trials_gov_mesh_term" id = Column(Integer, primary_key=True) mesh_term = Column(String(100), unique=True) - trials = relationship( - "ClinicalTrialGov", - secondary=ctg_mesh_term_n2m, - back_populates="mesh_terms") + trials = relationship("ClinicalTrialGov", secondary=ctg_mesh_term_n2m, back_populates="mesh_terms") def as_dict(self): """Convert object values to dictionary.""" return { - 'mesh_term': self.mesh_term, - 'number_of_trials': len(self.trials), - 'nct_ids': [x.nct_id for x in self.trials] + "mesh_term": self.mesh_term, + "number_of_trials": len(self.trials), + "nct_ids": [x.nct_id for x in self.trials], } class Intervention(Base): """Class definition for the clinical_trials_gov_intervention table.""" - __tablename__ = 'clinical_trials_gov_intervention' + __tablename__ = "clinical_trials_gov_intervention" id = Column(Integer, primary_key=True) intervention_type = Column(String(100), 
index=True) intervention_name = Column(String(255), index=True) trials = relationship( "ClinicalTrialGov", secondary=ctg_intervention_n2m, - back_populates="interventions") + back_populates="interventions", + ) def as_dict(self): """Convert object values to dictionary.""" return { - 'intervention_type': self.intervention_type, - 'intervention_name': self.intervention_name, - 'nct_ids': [x.nct_id for x in self.trials] + "intervention_type": self.intervention_type, + "intervention_name": self.intervention_name, + "nct_ids": [x.nct_id for x in self.trials], } diff --git a/ebel/manager/rdbms/models/clinvar.py b/ebel/manager/rdbms/models/clinvar.py index 8a1da4f..a7995ab 100644 --- a/ebel/manager/rdbms/models/clinvar.py +++ b/ebel/manager/rdbms/models/clinvar.py @@ -1,50 +1,51 @@ """ClinVar RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import Column, ForeignKey, Index, Integer, String, Table, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Table, Text, ForeignKey, Index +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import object_as_dict - Base = declarative_base() -clinvar__clinvar_phenotype = Table('clinvar__phenotype', Base.metadata, - Column('clinvar_id', Integer, ForeignKey('clinvar.id')), - Column('clinvar_phenotype_id', Integer, ForeignKey('clinvar_phenotype.id')) - ) +clinvar__clinvar_phenotype = Table( + "clinvar__phenotype", + Base.metadata, + Column("clinvar_id", Integer, ForeignKey("clinvar.id")), + Column("clinvar_phenotype_id", Integer, ForeignKey("clinvar_phenotype.id")), +) class ClinvarPhenotypeMedgen(Base): """Class definition for the clinvar_phenotype_medgen table.""" - __tablename__ = 'clinvar_phenotype_medgen' + __tablename__ = "clinvar_phenotype_medgen" id = Column(Integer, primary_key=True) identifier = Column(String(100), index=True) - clinvar_id = Column(Integer, ForeignKey('clinvar.id')) + clinvar_id = Column(Integer, 
ForeignKey("clinvar.id")) clinvar = relationship("Clinvar", foreign_keys=[clinvar_id], viewonly=True) class ClinvarOtherIdentifier(Base): """Class definition for the clinvar_other_identifier table.""" - __tablename__ = 'clinvar_other_identifier' + __tablename__ = "clinvar_other_identifier" id = Column(Integer, primary_key=True) db = Column(String(100), index=True) identifier = Column(String(100), index=True) - clinvar_id = Column(Integer, ForeignKey('clinvar.id')) + clinvar_id = Column(Integer, ForeignKey("clinvar.id")) clinvar = relationship("Clinvar", foreign_keys=[clinvar_id], viewonly=True) def as_dict(self): """Convert object values to dictionary.""" - return {'db': self.db, 'identifier': self.identifier} + return {"db": self.db, "identifier": self.identifier} class Clinvar(Base): """Class definition for the clinvar table.""" - __tablename__ = 'clinvar' + __tablename__ = "clinvar" id = Column(Integer, primary_key=True) allele_id = Column(Integer) @@ -82,32 +83,27 @@ class Clinvar(Base): phenotypeMedgens = relationship("ClinvarPhenotypeMedgen", foreign_keys=[ClinvarPhenotypeMedgen.clinvar_id]) otherIdentifiers = relationship("ClinvarOtherIdentifier", foreign_keys=[ClinvarOtherIdentifier.clinvar_id]) - phenotypes = relationship( - "ClinvarPhenotype", - secondary=clinvar__clinvar_phenotype) + phenotypes = relationship("ClinvarPhenotype", secondary=clinvar__clinvar_phenotype) - __table_args__ = (Index('ix_clinvar__gene_symbol', gene_symbol, mysql_length=500),) + __table_args__ = (Index("ix_clinvar__gene_symbol", gene_symbol, mysql_length=500),) def as_dict(self): """Convert object values to dictionary.""" clinvar_entry = object_as_dict(self) - clinvar_entry.update({'phenotypeMedgens': [x.identifier for x in self.phenotypeMedgens]}) - clinvar_entry.update({'otherIdentifiers': [x.as_dict() for x in self.otherIdentifiers]}) + clinvar_entry.update({"phenotypeMedgens": [x.identifier for x in self.phenotypeMedgens]}) + clinvar_entry.update({"otherIdentifiers": 
[x.as_dict() for x in self.otherIdentifiers]}) return clinvar_entry class ClinvarPhenotype(Base): """Class definition for the clinvar_phenotype table.""" - __tablename__ = 'clinvar_phenotype' + __tablename__ = "clinvar_phenotype" id = Column(Integer, primary_key=True) phenotype = Column(Text) - clinvars = relationship( - "Clinvar", - secondary=clinvar__clinvar_phenotype, - back_populates="phenotypes") + clinvars = relationship("Clinvar", secondary=clinvar__clinvar_phenotype, back_populates="phenotypes") def as_dict(self): """Convert object values to dictionary.""" - return {'phenotype': self.phenotype} + return {"phenotype": self.phenotype} diff --git a/ebel/manager/rdbms/models/disgenet.py b/ebel/manager/rdbms/models/disgenet.py index 9b00bff..cd32bcf 100644 --- a/ebel/manager/rdbms/models/disgenet.py +++ b/ebel/manager/rdbms/models/disgenet.py @@ -1,7 +1,7 @@ """DisGeNet RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import BigInteger, Column, Float, ForeignKey, Integer, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Float, BigInteger, ForeignKey +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import object_as_dict @@ -11,33 +11,35 @@ class DisgenetGene(Base): """Class definition for the disgenet_gene table.""" - __tablename__ = 'disgenet_gene' + __tablename__ = "disgenet_gene" id = Column(Integer, primary_key=True) - gene_id = Column(Integer, ForeignKey('disgenet_gene_symbol.gene_id')) - gene_symbol = relationship("DisgenetGeneSymbol", back_populates='gene_disease_pmid_associations') - disease_id = Column(String(100), ForeignKey('disgenet_disease.disease_id')) + gene_id = Column(Integer, ForeignKey("disgenet_gene_symbol.gene_id")) + gene_symbol = relationship("DisgenetGeneSymbol", back_populates="gene_disease_pmid_associations") + disease_id = Column(String(100), ForeignKey("disgenet_disease.disease_id")) disease = 
relationship("DisgenetDisease", foreign_keys=[disease_id]) score = Column(Float) pmid = Column(BigInteger) - source_id = Column(Integer, ForeignKey('disgenet_source.id')) + source_id = Column(Integer, ForeignKey("disgenet_source.id")) source = relationship("DisgenetSource", foreign_keys=[source_id]) def as_dict(self): """Convert object values to dictionary.""" - rs = object_as_dict(self, exclude=['id', 'source_id']) - rs.update({ - 'gene_symbol': self.gene_symbol.gene_symbol, - 'disease_name': self.disease.disease_name, - 'source': self.source.source - }) + rs = object_as_dict(self, exclude=["id", "source_id"]) + rs.update( + { + "gene_symbol": self.gene_symbol.gene_symbol, + "disease_name": self.disease.disease_name, + "source": self.source.source, + } + ) return rs class DisgenetGeneSymbol(Base): """Class definition for the disgenet_gene_symbol table.""" - __tablename__ = 'disgenet_gene_symbol' + __tablename__ = "disgenet_gene_symbol" gene_id = Column(Integer, primary_key=True) gene_symbol = Column(String(50), index=True) @@ -51,33 +53,30 @@ def as_dict(self): class DisgenetVariant(Base): """Class definition for the disgenet_variant table.""" - __tablename__ = 'disgenet_variant' + __tablename__ = "disgenet_variant" id = Column(Integer, primary_key=True) snp_id = Column(String(20), index=True) chromosome = Column(String(2)) position = Column(BigInteger) - disease_id = Column(String(100), ForeignKey('disgenet_disease.disease_id')) + disease_id = Column(String(100), ForeignKey("disgenet_disease.disease_id")) disease = relationship("DisgenetDisease", foreign_keys=[disease_id]) score = Column(Float) pmid = Column(BigInteger, index=True) - source_id = Column(Integer, ForeignKey('disgenet_source.id')) + source_id = Column(Integer, ForeignKey("disgenet_source.id")) source = relationship("DisgenetSource", foreign_keys=[source_id]) def as_dict(self): """Convert object values to dictionary.""" - rs = object_as_dict(self, exclude=['id', 'source_id']) - rs.update({ - 
'disease_name': self.disease.disease_name, - 'source': self.source.source - }) + rs = object_as_dict(self, exclude=["id", "source_id"]) + rs.update({"disease_name": self.disease.disease_name, "source": self.source.source}) return rs class DisgenetDisease(Base): """Class definition for the disgenet_disease table.""" - __tablename__ = 'disgenet_disease' + __tablename__ = "disgenet_disease" disease_id = Column(String(100), primary_key=True) disease_name = Column(String(255), index=True) @@ -89,10 +88,10 @@ def as_dict(self): class DisgenetSource(Base): """Class definition for the disgenet_source table.""" - __tablename__ = 'disgenet_source' + __tablename__ = "disgenet_source" id = Column(Integer, primary_key=True) source = Column(String(100), index=True) def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) diff --git a/ebel/manager/rdbms/models/drugbank.py b/ebel/manager/rdbms/models/drugbank.py index 8197820..e045bba 100644 --- a/ebel/manager/rdbms/models/drugbank.py +++ b/ebel/manager/rdbms/models/drugbank.py @@ -1,8 +1,8 @@ """DrugBank RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import Column, Date, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Text, ForeignKey, Date +from sqlalchemy.orm import relationship Base = declarative_base() @@ -10,7 +10,7 @@ class Drugbank(Base): """Class definition for the drugbank table.""" - __tablename__ = 'drugbank' + __tablename__ = "drugbank" id = Column(Integer, primary_key=True) drugbank_id = Column(String(10), index=True) name = Column(String(255)) @@ -47,39 +47,40 @@ def __str__(self): # TODO: add drug_interaction def as_dict(self): """Convert object values to dictionary.""" - return {'drugbank_id': self.drugbank_id, - 'name': self.name, - 'description': self.description, - 'cas_number': 
self.cas_number, - 'toxicity': self.toxicity, - 'indication': self.indication, - 'pharmacodynamics': self.pharmacodynamics, - 'metabolism': self.metabolism, - 'absorption': self.absorption, - 'half_life': self.half_life, - 'route_of_elimination': self.route_of_elimination, - 'volume_of_distribution': self.volume_of_distribution, - 'clearance': self.clearance, - 'mechanism_of_action': self.mechanism_of_action, - 'fda_label': self.fda_label, - 'references': [x.pmid for x in self.references], - 'pathways': [x.smpdb_id for x in self.pathways], - 'patents': [x.as_dict() for x in self.patents], - 'targets': [x.as_dict() for x in self.targets], - 'product_names': [x.name for x in self.product_names], - 'external_identifiers': [x.as_dict() for x in self.external_identifiers], - 'statuses': [x.status for x in self.statuses], - } + return { + "drugbank_id": self.drugbank_id, + "name": self.name, + "description": self.description, + "cas_number": self.cas_number, + "toxicity": self.toxicity, + "indication": self.indication, + "pharmacodynamics": self.pharmacodynamics, + "metabolism": self.metabolism, + "absorption": self.absorption, + "half_life": self.half_life, + "route_of_elimination": self.route_of_elimination, + "volume_of_distribution": self.volume_of_distribution, + "clearance": self.clearance, + "mechanism_of_action": self.mechanism_of_action, + "fda_label": self.fda_label, + "references": [x.pmid for x in self.references], + "pathways": [x.smpdb_id for x in self.pathways], + "patents": [x.as_dict() for x in self.patents], + "targets": [x.as_dict() for x in self.targets], + "product_names": [x.name for x in self.product_names], + "external_identifiers": [x.as_dict() for x in self.external_identifiers], + "statuses": [x.status for x in self.statuses], + } class Pathway(Base): """Class definition for the drugbank_pathway table.""" - __tablename__ = 'drugbank_pathway' + __tablename__ = "drugbank_pathway" id = Column(Integer, primary_key=True) smpdb_id = 
Column(String(255)) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="pathways") def __str__(self): @@ -87,13 +88,13 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return {'smpdb_id': self.smpdb_id, 'drugbank_id': self.drugbank.drugbank_id} + return {"smpdb_id": self.smpdb_id, "drugbank_id": self.drugbank.drugbank_id} class Patent(Base): """Class definition for the drugbank_patent table.""" - __tablename__ = 'drugbank_patent' + __tablename__ = "drugbank_patent" id = Column(Integer, primary_key=True) number = Column(String(255)) country = Column(String(255)) @@ -101,7 +102,7 @@ class Patent(Base): expires = Column(Date) pediatric_extension = Column(String(255)) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="patents") def __str__(self): @@ -109,23 +110,24 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return {'number': self.number, - 'country': self.country, - 'approved': self.approved.strftime("%Y-%m-%d"), - 'expires': self.expires.strftime("%Y-%m-%d"), - 'pediatric_extension': self.pediatric_extension, - 'drugbank_id': self.drugbank.drugbank_id - } + return { + "number": self.number, + "country": self.country, + "approved": self.approved.strftime("%Y-%m-%d"), + "expires": self.expires.strftime("%Y-%m-%d"), + "pediatric_extension": self.pediatric_extension, + "drugbank_id": self.drugbank.drugbank_id, + } class Status(Base): """Class definition for the drugbank_status table.""" - __tablename__ = 'drugbank_status' + __tablename__ = "drugbank_status" id = Column(Integer, primary_key=True) status = Column(String(20), index=True) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = 
relationship("Drugbank", back_populates="statuses") def __str__(self): @@ -133,18 +135,18 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return {'smpdb_id': self.status, 'drugbank_id': self.drugbank.drugbank_id} + return {"smpdb_id": self.status, "drugbank_id": self.drugbank.drugbank_id} class ExternalIdentifier(Base): """Class definition for the drugbank_external_identifier table.""" - __tablename__ = 'drugbank_external_identifier' + __tablename__ = "drugbank_external_identifier" id = Column(Integer, primary_key=True) - resource = Column(String(255)) + resource = Column(String(255), index=True) identifier = Column(String(255), index=True) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="external_identifiers") def __str__(self): @@ -152,17 +154,21 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return {'resource': self.resource, 'identifier': self.identifier, 'drugbank_id': self.drugbank.drugbank_id} + return { + "resource": self.resource, + "identifier": self.identifier, + "drugbank_id": self.drugbank.drugbank_id, + } class Reference(Base): """Class definition for the drugbank_reference table.""" - __tablename__ = 'drugbank_reference' + __tablename__ = "drugbank_reference" id = Column(Integer, primary_key=True) pmid = Column(Integer) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="references") def __str__(self): @@ -170,19 +176,19 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return {'pmid': self.pmid, 'drugbank_id': self.drugbank.drugbank_id} + return {"pmid": self.pmid, "drugbank_id": self.drugbank.drugbank_id} class Target(Base): """Class definition for the drugbank_target table.""" - __tablename__ = 
'drugbank_target' + __tablename__ = "drugbank_target" id = Column(Integer, primary_key=True) uniprot = Column(String(20), index=True) action = Column(String(50), index=True) known_action = Column(String(20), index=True) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="targets") def __str__(self): @@ -191,23 +197,23 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'uniprot': self.uniprot, - 'action': self.action, - 'known_action': self.known_action, - 'drugbank_id': self.drugbank.drugbank_id + "uniprot": self.uniprot, + "action": self.action, + "known_action": self.known_action, + "drugbank_id": self.drugbank.drugbank_id, } class DrugInteraction(Base): """Class definition for the drugbank_drug_interaction table.""" - __tablename__ = 'drugbank_drug_interaction' + __tablename__ = "drugbank_drug_interaction" id = Column(Integer, primary_key=True) drugbank_id = Column(String(10), index=True) name = Column(Text) description = Column(Text) - db_id = Column(Integer, ForeignKey('drugbank.id')) # exception because drugbank_id is already a field + db_id = Column(Integer, ForeignKey("drugbank.id")) # exception because drugbank_id is already a field drugbank = relationship("Drugbank", back_populates="drug_interactions") def __str__(self): @@ -216,21 +222,21 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" return { - 'drugbank_id': self.drugbank_id, - 'name': self.name, - 'description': self.description, - 'interactor_drugbank_id': self.drugbank.drugbank_id + "drugbank_id": self.drugbank_id, + "name": self.name, + "description": self.description, + "interactor_drugbank_id": self.drugbank.drugbank_id, } class ProductName(Base): """Class definition for the drugbank_product_name table.""" - __tablename__ = 'drugbank_product_name' + __tablename__ = "drugbank_product_name" id = 
Column(Integer, primary_key=True) name = Column(Text) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="product_names") def __str__(self): @@ -238,20 +244,17 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return { - 'drugbank_id': self.drugbank.drugbank_id, - 'name': self.name - } + return {"drugbank_id": self.drugbank.drugbank_id, "name": self.name} class Synonym(Base): """Class definition for the drugbank_synonym table.""" - __tablename__ = 'drugbank_synonym' + __tablename__ = "drugbank_synonym" id = Column(Integer, primary_key=True) synonym = Column(Text) - drugbank_id = Column(Integer, ForeignKey('drugbank.id')) + drugbank_id = Column(Integer, ForeignKey("drugbank.id")) drugbank = relationship("Drugbank", back_populates="synonyms") def __str__(self): @@ -259,7 +262,4 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" - return { - 'drugbank_id': self.drugbank.drugbank_id, - 'synonym': self.synonym - } + return {"drugbank_id": self.drugbank.drugbank_id, "synonym": self.synonym} diff --git a/ebel/manager/rdbms/models/ensembl.py b/ebel/manager/rdbms/models/ensembl.py index 30d336b..6c7115e 100644 --- a/ebel/manager/rdbms/models/ensembl.py +++ b/ebel/manager/rdbms/models/ensembl.py @@ -11,7 +11,7 @@ class Ensembl(Base): """Class definition for the ensembl table.""" - __tablename__ = 'ensembl' + __tablename__ = "ensembl" id = Column(Integer, primary_key=True) enst = Column(String(20), index=True) version = Column(Integer) diff --git a/ebel/manager/rdbms/models/expression_atlas.py b/ebel/manager/rdbms/models/expression_atlas.py index 243ad14..ce70217 100644 --- a/ebel/manager/rdbms/models/expression_atlas.py +++ b/ebel/manager/rdbms/models/expression_atlas.py @@ -1,5 +1,5 @@ """Expression Atlas RDBMS model definition.""" -from sqlalchemy import Column, Integer, String, Text, 
ForeignKey, Float +from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship @@ -11,7 +11,7 @@ class Experiment(Base): """Table definition for experiment.""" - __tablename__ = 'expression_atlas_experiment' + __tablename__ = "expression_atlas_experiment" id = Column(Integer, primary_key=True) @@ -25,23 +25,23 @@ class Experiment(Base): def as_dict(self): """Convert object values to dictionary.""" experiment = object_as_dict(self) - experiment.update({'idfs': {idf.key_name: idf.value for idf in self.idfs}}) - gc = [{'groups': x.group_comparison, 'name': x.name, 'id': x.id} for x in self.group_comparisons] - experiment.update({'group_comparison': gc}) + experiment.update({"idfs": {idf.key_name: idf.value for idf in self.idfs}}) + gc = [{"groups": x.group_comparison, "name": x.name, "id": x.id} for x in self.group_comparisons] + experiment.update({"group_comparison": gc}) return experiment class Idf(Base): """Table definition for IDF.""" - __tablename__ = 'expression_atlas_idf' + __tablename__ = "expression_atlas_idf" id = Column(Integer, primary_key=True) key_name = Column(Text, nullable=False) value = Column(Text, nullable=False) - experiment_id = Column(Integer, ForeignKey('expression_atlas_experiment.id')) + experiment_id = Column(Integer, ForeignKey("expression_atlas_experiment.id")) experiment = relationship("Experiment", back_populates="idfs") def as_dict(self): @@ -52,11 +52,11 @@ def as_dict(self): class GroupComparison(Base): """Table definition for group comparison.""" - __tablename__ = 'expression_atlas_group_comparison' + __tablename__ = "expression_atlas_group_comparison" id = Column(Integer, primary_key=True) - experiment_id = Column(Integer, ForeignKey('expression_atlas_experiment.id')) + experiment_id = Column(Integer, ForeignKey("expression_atlas_experiment.id")) experiment = relationship("Experiment", back_populates="group_comparisons") 
group_comparison = Column(String(100)) @@ -73,7 +73,7 @@ def as_dict(self): class FoldChange(Base): """Table definition for fold changes.""" - __tablename__ = 'expression_atlas_foldchange' + __tablename__ = "expression_atlas_foldchange" id = Column(Integer, primary_key=True) @@ -83,7 +83,7 @@ class FoldChange(Base): p_value = Column(Float, index=True) t_statistic = Column(Float) - group_comparison_id = Column(Integer, ForeignKey('expression_atlas_group_comparison.id')) + group_comparison_id = Column(Integer, ForeignKey("expression_atlas_group_comparison.id")) group_comparison = relationship("GroupComparison", back_populates="fold_changes") def as_dict(self): @@ -94,11 +94,11 @@ def as_dict(self): class SdrfCondensed(Base): """Table definition for SDRF condensed.""" - __tablename__ = 'expression_atlas_sdrf_condensed' + __tablename__ = "expression_atlas_sdrf_condensed" id = Column(Integer, primary_key=True) - experiment_id = Column(Integer, ForeignKey('expression_atlas_experiment.id')) + experiment_id = Column(Integer, ForeignKey("expression_atlas_experiment.id")) experiment = relationship("Experiment", back_populates="sdrf_condenseds") method = Column(String(255)) @@ -116,11 +116,11 @@ def as_dict(self): class Gsea(Base): """Table definition for Genset enrichment table.""" - __tablename__ = 'expression_atlas_gsea' + __tablename__ = "expression_atlas_gsea" id = Column(Integer, primary_key=True) - group_comparison_id = Column(Integer, ForeignKey('expression_atlas_group_comparison.id')) + group_comparison_id = Column(Integer, ForeignKey("expression_atlas_group_comparison.id")) group_comparison = relationship("GroupComparison", back_populates="gseas") term = Column(String(255), index=True) diff --git a/ebel/manager/rdbms/models/gwas_catalog.py b/ebel/manager/rdbms/models/gwas_catalog.py index f915b59..97a6f8c 100644 --- a/ebel/manager/rdbms/models/gwas_catalog.py +++ b/ebel/manager/rdbms/models/gwas_catalog.py @@ -1,7 +1,7 @@ """GWAS Catalog RDBMS model definition.""" 
-from sqlalchemy.orm import relationship +from sqlalchemy import Column, Float, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Float, Text, ForeignKey +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import object_as_dict @@ -11,7 +11,7 @@ class GwasCatalog(Base): """Class definition for the gwascatalog table.""" - __tablename__ = 'gwascatalog' + __tablename__ = "gwascatalog" id = Column(Integer, primary_key=True) date_added_to_catalog = Column(String(255)) pubmedid = Column(Integer) @@ -52,15 +52,15 @@ class GwasCatalog(Base): def as_dict(self): """Convert object values to dictionary.""" gwas_catalog = object_as_dict(self) - gwas_catalog.update({'snp_genes': [x.ensembl_identifier for x in self.snp_genes]}) + gwas_catalog.update({"snp_genes": [x.ensembl_identifier for x in self.snp_genes]}) return gwas_catalog class SnpGene(Base): """Class definition for the gwascatalog_snpgene table.""" - __tablename__ = 'gwascatalog_snpgene' + __tablename__ = "gwascatalog_snpgene" id = Column(Integer, primary_key=True) ensembl_identifier = Column(String(100), nullable=False, index=True) - gwascatalog_id = Column(Integer, ForeignKey('gwascatalog.id')) + gwascatalog_id = Column(Integer, ForeignKey("gwascatalog.id")) gwascatalog = relationship("GwasCatalog", back_populates="snp_genes") diff --git a/ebel/manager/rdbms/models/hgnc.py b/ebel/manager/rdbms/models/hgnc.py index 46d3796..da21ff4 100644 --- a/ebel/manager/rdbms/models/hgnc.py +++ b/ebel/manager/rdbms/models/hgnc.py @@ -1,7 +1,8 @@ """HGNC RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import (BigInteger, Column, Date, ForeignKey, Integer, String, + Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, BigInteger, Text, ForeignKey, Date +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import 
object_as_dict @@ -11,7 +12,7 @@ class Hgnc(Base): """Class definition for the hgnc table.""" - __tablename__ = 'hgnc' + __tablename__ = "hgnc" id = Column(Integer, primary_key=True) hgnc_id = Column(String(20)) version = Column(BigInteger) @@ -27,7 +28,6 @@ class Hgnc(Base): homeodb = Column(Integer) horde_id = Column(String(50)) imgt = Column(String(50)) - intermediate_filament_db = Column(String(50)) iuphar = Column(String(50)) kznf_gene_catalog = Column(Integer) lncipedia = Column(String(50)) @@ -36,12 +36,10 @@ class Hgnc(Base): location_sortable = Column(String(100)) locus_group = Column(String(50)) locus_type = Column(String(50)) - mamit_trnadb = Column(Integer) merops = Column(String(20)) mirbase = Column(String(20)) name = Column(String(255)) orphanet = Column(Integer) - pseudogene_org = Column(String(50)) snornabase = Column(String(20)) status = Column(String(50)) symbol = Column(String(100), index=True) @@ -72,71 +70,68 @@ class Hgnc(Base): def as_dict(self): """Convert object values to dictionary.""" return { - 'hgnc_id': self.hgnc_id, - 'version': self.version, - 'bioparadigms_slc': self.bioparadigms_slc, - 'cd': self.cd, - 'cosmic': self.cosmic, - 'date_approved_reserved': self.date_approved_reserved, - 'date_modified': self.date_modified, - 'date_name_changed': self.date_name_changed, - 'date_symbol_changed': self.date_symbol_changed, - 'ensembl_gene_id': self.ensembl_gene_id, - 'entrez_id': self.entrez_id, - 'homeodb': self.homeodb, - 'horde_id': self.horde_id, - 'imgt': self.imgt, - 'intermediate_filament_db': self.intermediate_filament_db, - 'iuphar': self.iuphar, - 'kznf_gene_catalog': self.kznf_gene_catalog, - 'lncipedia': self.lncipedia, - 'lncrnadb': self.lncrnadb, - 'location': self.location, - 'location_sortable': self.location_sortable, - 'locus_group': self.locus_group, - 'locus_type': self.locus_type, - 'mamit_trnadb': self.mamit_trnadb, - 'merops': self.merops, - 'mirbase': self.mirbase, - 'name': self.name, - 'orphanet': self.orphanet, 
- 'pseudogene_org': self.pseudogene_org, - 'snornabase': self.snornabase, - 'status': self.status, - 'symbol': self.symbol, - 'ucsc_id': self.ucsc_id, - 'uuid': self.uuid, - 'vega_id': self.vega_id, - 'agr': self.agr, - 'pre_symbols': [x.prev_symbol for x in self.pre_symbols], - 'alias_names': [x.alias_name for x in self.alias_names], - 'alias_symbols': [x.alias_symbol for x in self.alias_symbols], - 'ccdss': [x.identifier for x in self.ccdss], - 'enas': [x.identifier for x in self.enas], - 'enzymes': [x.ec_number for x in self.enzymes], - 'gene_group_names': [x.name for x in self.gene_group_names], - 'gene_group_ids': [x.identifier for x in self.gene_group_ids], - 'uniprots': [x.accession for x in self.uniprots], - 'rna_centrals': [x.identifier for x in self.rna_centrals], - 'rgds': [x.identifier for x in self.rgds], - 'refseqs': [x.accession for x in self.refseqs], - 'pubmeds': [x.pmid for x in self.pubmeds], - 'prev_names': [x.prev_name for x in self.prev_names], - 'omims': [x.identifier for x in self.omims], - 'mgds': [x.identifier for x in self.mgds], - 'lsdbs': [x.identifier for x in self.lsdbs] + "hgnc_id": self.hgnc_id, + "version": self.version, + "bioparadigms_slc": self.bioparadigms_slc, + "cd": self.cd, + "cosmic": self.cosmic, + "date_approved_reserved": self.date_approved_reserved, + "date_modified": self.date_modified, + "date_name_changed": self.date_name_changed, + "date_symbol_changed": self.date_symbol_changed, + "ensembl_gene_id": self.ensembl_gene_id, + "entrez_id": self.entrez_id, + "homeodb": self.homeodb, + "horde_id": self.horde_id, + "imgt": self.imgt, + "iuphar": self.iuphar, + "kznf_gene_catalog": self.kznf_gene_catalog, + "lncipedia": self.lncipedia, + "lncrnadb": self.lncrnadb, + "location": self.location, + "location_sortable": self.location_sortable, + "locus_group": self.locus_group, + "locus_type": self.locus_type, + "merops": self.merops, + "mirbase": self.mirbase, + "name": self.name, + "orphanet": self.orphanet, + "snornabase": 
self.snornabase, + "status": self.status, + "symbol": self.symbol, + "ucsc_id": self.ucsc_id, + "uuid": self.uuid, + "vega_id": self.vega_id, + "agr": self.agr, + "pre_symbols": [x.prev_symbol for x in self.pre_symbols], + "alias_names": [x.alias_name for x in self.alias_names], + "alias_symbols": [x.alias_symbol for x in self.alias_symbols], + "ccdss": [x.identifier for x in self.ccdss], + "enas": [x.identifier for x in self.enas], + "enzymes": [x.ec_number for x in self.enzymes], + "gene_group_names": [x.name for x in self.gene_group_names], + "gene_group_ids": [x.identifier for x in self.gene_group_ids], + "uniprots": [x.accession for x in self.uniprots], + "rna_centrals": [x.identifier for x in self.rna_centrals], + "rgds": [x.identifier for x in self.rgds], + "refseqs": [x.accession for x in self.refseqs], + "pubmeds": [x.pmid for x in self.pubmeds], + "prev_names": [x.prev_name for x in self.prev_names], + "omims": [x.identifier for x in self.omims], + "mgds": [x.identifier for x in self.mgds], + "lsdbs": [x.identifier for x in self.lsdbs], } class PrevSymbol(Base): """Class definition for the hgnc_prev_symbol table.""" - __tablename__ = 'hgnc_prev_symbol' + __tablename__ = "hgnc_prev_symbol" id = Column(Integer, primary_key=True) prev_symbol = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="pre_symbols") def __str__(self): @@ -146,12 +141,12 @@ def __str__(self): class AliasName(Base): """Class definition for the hgnc_alias_name table.""" - __tablename__ = 'hgnc_alias_name' + __tablename__ = "hgnc_alias_name" id = Column(Integer, primary_key=True) alias_name = Column(String(255)) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="alias_names") def __str__(self): @@ -161,12 +156,12 @@ def __str__(self): class AliasSymbol(Base): """Class 
definition for the hgnc_alias_symbol table.""" - __tablename__ = 'hgnc_alias_symbol' + __tablename__ = "hgnc_alias_symbol" id = Column(Integer, primary_key=True) alias_symbol = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="alias_symbols") def __str__(self): @@ -176,12 +171,12 @@ def __str__(self): class Ccds(Base): """Class definition for the hgnc_ccds table.""" - __tablename__ = 'hgnc_ccds' + __tablename__ = "hgnc_ccds" id = Column(Integer, primary_key=True) identifier = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="ccdss") def __str__(self): @@ -191,12 +186,12 @@ def __str__(self): class Ena(Base): """Class definition for the hgnc_ena table.""" - __tablename__ = 'hgnc_ena' + __tablename__ = "hgnc_ena" id = Column(Integer, primary_key=True) identifier = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="enas") def __str__(self): @@ -206,12 +201,12 @@ def __str__(self): class Enzyme(Base): """Class definition for the hgnc_enzyme table.""" - __tablename__ = 'hgnc_enzyme' + __tablename__ = "hgnc_enzyme" id = Column(Integer, primary_key=True) ec_number = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="enzymes") def __str__(self): @@ -221,12 +216,12 @@ def __str__(self): class GeneGroupName(Base): """Class definition for the hgnc_gene_group_name table.""" - __tablename__ = 'hgnc_gene_group_name' + __tablename__ = "hgnc_gene_group_name" id = Column(Integer, primary_key=True) name = Column(String(255)) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = 
Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="gene_group_names") def __str__(self): @@ -240,12 +235,12 @@ def as_dict(self): class GeneGroupId(Base): """Class definition for the hgnc_gene_group_id table.""" - __tablename__ = 'hgnc_gene_group_id' + __tablename__ = "hgnc_gene_group_id" id = Column(Integer, primary_key=True) identifier = Column(Integer) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="gene_group_ids") def __str__(self): @@ -255,12 +250,12 @@ def __str__(self): class UniProt(Base): """Class definition for the hgnc_uniprot table.""" - __tablename__ = 'hgnc_uniprot' + __tablename__ = "hgnc_uniprot" id = Column(Integer, primary_key=True) accession = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="uniprots") def __str__(self): @@ -270,12 +265,12 @@ def __str__(self): class RnaCentral(Base): """Class definition for the hgnc_rna_central table.""" - __tablename__ = 'hgnc_rna_central' + __tablename__ = "hgnc_rna_central" id = Column(Integer, primary_key=True) identifier = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="rna_centrals") def __str__(self): @@ -285,12 +280,12 @@ def __str__(self): class Rgd(Base): """Class definition for the hgnc_rgd table.""" - __tablename__ = 'hgnc_rgd' + __tablename__ = "hgnc_rgd" id = Column(Integer, primary_key=True) identifier = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="rgds") def __str__(self): @@ -300,12 +295,12 @@ def __str__(self): class RefSeq(Base): """Class definition for the hgnc_refseq table.""" - 
__tablename__ = 'hgnc_refseq' + __tablename__ = "hgnc_refseq" id = Column(Integer, primary_key=True) accession = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="refseqs") def __str__(self): @@ -315,12 +310,12 @@ def __str__(self): class PubMed(Base): """Class definition for the hgnc_pubmed table.""" - __tablename__ = 'hgnc_pubmed' + __tablename__ = "hgnc_pubmed" id = Column(Integer, primary_key=True) pmid = Column(Integer, index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="pubmeds") def __str__(self): @@ -330,12 +325,12 @@ def __str__(self): class PrevName(Base): """Class definition for the hgnc_prev_name table.""" - __tablename__ = 'hgnc_prev_name' + __tablename__ = "hgnc_prev_name" id = Column(Integer, primary_key=True) prev_name = Column(String(255)) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="prev_names") def __str__(self): @@ -345,12 +340,12 @@ def __str__(self): class Omim(Base): """Class definition for the hgnc_omim table.""" - __tablename__ = 'hgnc_omim' + __tablename__ = "hgnc_omim" id = Column(Integer, primary_key=True) identifier = Column(Integer, index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="omims") def __str__(self): @@ -360,12 +355,12 @@ def __str__(self): class Mgd(Base): """Class definition for the hgnc_mgd table.""" - __tablename__ = 'hgnc_mgd' + __tablename__ = "hgnc_mgd" id = Column(Integer, primary_key=True) identifier = Column(String(50), index=True) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="mgds") 
def __str__(self): @@ -375,12 +370,12 @@ def __str__(self): class Lsdb(Base): """Class definition for the hgnc_lsdb table.""" - __tablename__ = 'hgnc_lsdb' + __tablename__ = "hgnc_lsdb" id = Column(Integer, primary_key=True) identifier = Column(Text) - hgnc_id = Column(Integer, ForeignKey('hgnc.id')) + hgnc_id = Column(Integer, ForeignKey("hgnc.id")) hgnc = relationship("Hgnc", back_populates="lsdbs") def __str__(self): diff --git a/ebel/manager/rdbms/models/human_ortholog.py b/ebel/manager/rdbms/models/human_ortholog.py index 73690da..a1ccd37 100644 --- a/ebel/manager/rdbms/models/human_ortholog.py +++ b/ebel/manager/rdbms/models/human_ortholog.py @@ -10,7 +10,7 @@ class HumanOrtholog(Base): """Class definition for the human_ortholog table.""" - __tablename__ = 'human_ortholog' + __tablename__ = "human_ortholog" id = Column(Integer, primary_key=True) diff --git a/ebel/manager/rdbms/models/intact.py b/ebel/manager/rdbms/models/intact.py index 372e046..ab5ac33 100644 --- a/ebel/manager/rdbms/models/intact.py +++ b/ebel/manager/rdbms/models/intact.py @@ -1,6 +1,6 @@ """IntAct RDBMS model definition.""" +from sqlalchemy import Column, Float, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Float, Text from ebel.manager.rdbms.models import object_as_dict @@ -10,7 +10,7 @@ class Intact(Base): """Class definition for the intact table.""" - __tablename__ = 'intact' + __tablename__ = "intact" id = Column(Integer, primary_key=True) confidence_value = Column(Float, index=True) detection_method = Column(String(100), index=True) diff --git a/ebel/manager/rdbms/models/iuphar.py b/ebel/manager/rdbms/models/iuphar.py index 7723bb9..790a929 100644 --- a/ebel/manager/rdbms/models/iuphar.py +++ b/ebel/manager/rdbms/models/iuphar.py @@ -1,7 +1,8 @@ """IUPHAR RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import (BigInteger, Boolean, Column, ForeignKey, Integer, + 
Numeric, String, Text) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Boolean, Text, ForeignKey, BigInteger, Numeric +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import object_as_dict @@ -11,7 +12,7 @@ class IupharLigand(Base): """Class definition for the iuphar_ligand table.""" - __tablename__ = 'iuphar_ligand' + __tablename__ = "iuphar_ligand" id = Column(Integer, primary_key=True) name = Column(Text) @@ -24,7 +25,7 @@ class IupharLigand(Base): pubchem_sid = Column(BigInteger) pubchem_cid = Column(Text) # TODO: This is a integer, but for import reasons this changed to text uniprot_id = Column(Text) - ensembl_id = Column(BigInteger) + ensembl_id = Column(Text) ligand_subunit_ids = Column(Text) ligand_subunit_name = Column(Text) ligand_subunit_uni_prot_ids = Column(Text) @@ -49,7 +50,7 @@ def as_dict(self): class IupharInteraction(Base): """Class definition for the iuphar_interaction table.""" - __tablename__ = 'iuphar_interaction' + __tablename__ = "iuphar_interaction" id = Column(Integer, primary_key=True) target = Column(String(255)) @@ -67,11 +68,12 @@ class IupharInteraction(Base): target_ligand_pubchem_sid = Column(Integer) target_species = Column(String(100)) ligand = Column(String(255)) - ligand_id = Column(Integer, ForeignKey('iuphar_ligand.id'), index=True) + ligand_id = Column(Integer, ForeignKey("iuphar_ligand.id"), index=True) ligand_subunit_ids = Column(Text) ligand_gene_symbol = Column(String(50)) ligand_species = Column(String(50)) ligand_pubchem_sid = Column(Integer) + ligand_type = Column(Text) approved = Column(Boolean) type = Column(String(100)) action = Column(String(100)) diff --git a/ebel/manager/rdbms/models/kegg.py b/ebel/manager/rdbms/models/kegg.py index 56fd615..c5b07ee 100644 --- a/ebel/manager/rdbms/models/kegg.py +++ b/ebel/manager/rdbms/models/kegg.py @@ -10,7 +10,7 @@ class Kegg(Base): """Class definition for the kegg table.""" - __tablename__ = 
'kegg' + __tablename__ = "kegg" id = Column(Integer, primary_key=True) pathway_identifier = Column(String(100)) diff --git a/ebel/manager/rdbms/models/mirtarbase.py b/ebel/manager/rdbms/models/mirtarbase.py index b8e905c..6f5014e 100644 --- a/ebel/manager/rdbms/models/mirtarbase.py +++ b/ebel/manager/rdbms/models/mirtarbase.py @@ -10,7 +10,7 @@ class Mirtarbase(Base): """Class definition for the mirtarbase table.""" - __tablename__ = 'mirtarbase' + __tablename__ = "mirtarbase" id = Column(Integer, primary_key=True) mi_rtar_base_id = Column(String(20)) diff --git a/ebel/manager/rdbms/models/ncbi.py b/ebel/manager/rdbms/models/ncbi.py index 1fa432e..59a56f6 100644 --- a/ebel/manager/rdbms/models/ncbi.py +++ b/ebel/manager/rdbms/models/ncbi.py @@ -1,7 +1,7 @@ """NCBI RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import Column, ForeignKey, Integer, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Text, ForeignKey +from sqlalchemy.orm import relationship from . 
import object_as_dict @@ -11,7 +11,7 @@ class NcbiGeneInfo(Base): """Class definition for the ncbi_gene_info table.""" - __tablename__ = 'ncbi_gene_info' + __tablename__ = "ncbi_gene_info" gene_id = Column(Integer, primary_key=True) tax_id = Column(Integer, index=True) @@ -20,37 +20,46 @@ class NcbiGeneInfo(Base): locus_tag = Column(String(100)) chromosome = Column(String(100)) map_location = Column(String(100)) - description_id = Column(Integer, ForeignKey('ncbi_gene_info_description.id')) + description_id = Column(Integer, ForeignKey("ncbi_gene_info_description.id")) description = relationship("NcbiGeneInfoDescription", foreign_keys=[description_id]) xrefs = relationship("NcbiGeneInfoXref", back_populates="gene") - mims = relationship("NcbiGeneMim", foreign_keys='NcbiGeneMim.gene_id', back_populates="gene") - orthologs = relationship("NcbiGeneOrtholog", foreign_keys='NcbiGeneOrtholog.gene_id', back_populates="gene") + mims = relationship("NcbiGeneMim", foreign_keys="NcbiGeneMim.gene_id", back_populates="gene") + orthologs = relationship( + "NcbiGeneOrtholog", + foreign_keys="NcbiGeneOrtholog.gene_id", + back_populates="gene", + ) ensembl_ids = relationship("NcbiGeneEnsembl", back_populates="genes") - gene_ids_right = relationship("NcbiGeneOnRight", foreign_keys='NcbiGeneOnRight.gene_id', back_populates="gene") - gene_ids_left = relationship("NcbiGeneOnLeft", foreign_keys='NcbiGeneOnLeft.gene_id', back_populates="gene") - gene_ids_overlapping = relationship("NcbiGeneOverlapping", foreign_keys='NcbiGeneOverlapping.gene_id', - back_populates="gene") + gene_ids_right = relationship("NcbiGeneOnRight", foreign_keys="NcbiGeneOnRight.gene_id", back_populates="gene") + gene_ids_left = relationship("NcbiGeneOnLeft", foreign_keys="NcbiGeneOnLeft.gene_id", back_populates="gene") + gene_ids_overlapping = relationship( + "NcbiGeneOverlapping", + foreign_keys="NcbiGeneOverlapping.gene_id", + back_populates="gene", + ) def as_dict(self): """Convert object values to 
dictionary.""" - rdict = object_as_dict(self, ['description_id']) - rdict.update({ - 'xrefs': [{'db': x.db, 'dbid': x.dbid} for x in self.xrefs], - 'mims': [x.mim_number for x in self.mims], - 'description': self.description.description, - 'ensembls': [x.ensembl_gene_identifier for x in self.ensembl_ids], - 'orthologs': [x.other_gene_id for x in self.orthologs], - 'gene_ids_right': [x.gene_id_on_right for x in self.gene_ids_right], - 'gene_ids_left': [x.gene_id_on_left for x in self.gene_ids_left], - 'gene_ids_overlapping': [x.overlapping_gene_id for x in self.gene_ids_overlapping] - }) + rdict = object_as_dict(self, ["description_id"]) + rdict.update( + { + "xrefs": [{"db": x.db, "dbid": x.dbid} for x in self.xrefs], + "mims": [x.mim_number for x in self.mims], + "description": self.description.description, + "ensembls": [x.ensembl_gene_identifier for x in self.ensembl_ids], + "orthologs": [x.other_gene_id for x in self.orthologs], + "gene_ids_right": [x.gene_id_on_right for x in self.gene_ids_right], + "gene_ids_left": [x.gene_id_on_left for x in self.gene_ids_left], + "gene_ids_overlapping": [x.overlapping_gene_id for x in self.gene_ids_overlapping], + } + ) return rdict class NcbiGeneInfoDescription(Base): """Class definition for the ncbi_gene_info_description table.""" - __tablename__ = 'ncbi_gene_info_description' + __tablename__ = "ncbi_gene_info_description" id = Column(Integer, primary_key=True, autoincrement=True) description = Column(Text) @@ -58,10 +67,10 @@ class NcbiGeneInfoDescription(Base): class NcbiGeneOnRight(Base): """Class definition for the ncbi_gene_on_right table.""" - __tablename__ = 'ncbi_gene_on_right' + __tablename__ = "ncbi_gene_on_right" id = Column(Integer, primary_key=True, autoincrement=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) - gene_id_on_right = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + gene_id_on_right = Column(Integer, 
ForeignKey("ncbi_gene_info.gene_id")) gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) @@ -69,10 +78,10 @@ class NcbiGeneOnRight(Base): class NcbiGeneOnLeft(Base): """Class definition for the ncbi_gene_on_left table.""" - __tablename__ = 'ncbi_gene_on_left' + __tablename__ = "ncbi_gene_on_left" id = Column(Integer, primary_key=True, autoincrement=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) - gene_id_on_left = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + gene_id_on_left = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) @@ -80,10 +89,10 @@ class NcbiGeneOnLeft(Base): class NcbiGeneOverlapping(Base): """Class definition for the ncbi_gene_overlapping table.""" - __tablename__ = 'ncbi_gene_overlapping' + __tablename__ = "ncbi_gene_overlapping" id = Column(Integer, primary_key=True, autoincrement=True) - gene_id = gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) - overlapping_gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) + overlapping_gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) gene = relationship("NcbiGeneInfo", foreign_keys=[gene_id]) @@ -91,12 +100,12 @@ class NcbiGeneOverlapping(Base): class NcbiGeneOrtholog(Base): """Class definition for the ncbi_gene_ortholog table.""" - __tablename__ = 'ncbi_gene_ortholog' + __tablename__ = "ncbi_gene_ortholog" id = Column(Integer, primary_key=True, autoincrement=True) tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) other_tax_id = Column(Integer, index=True) - other_gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + other_gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) gene = 
relationship("NcbiGeneInfo", foreign_keys=[gene_id]) @@ -104,11 +113,11 @@ class NcbiGeneOrtholog(Base): class NcbiGenePubmed(Base): """Class definition for the ncbi_gene_pubmed table.""" - __tablename__ = 'ncbi_gene_pubmed' + __tablename__ = "ncbi_gene_pubmed" id = Column(Integer, primary_key=True) tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) pub_med_id = Column(Integer) @@ -120,7 +129,7 @@ class NcbiGeneInfoXref(Base): db = Column(String(100), index=True) dbid = Column(String(100), index=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) gene = relationship("NcbiGeneInfo", back_populates="xrefs") @@ -128,11 +137,11 @@ class NcbiGeneInfoXref(Base): class NcbiGeneMim(Base): """Class definition for the ncbi_gene_mim table.""" - __tablename__ = 'ncbi_gene_mim' + __tablename__ = "ncbi_gene_mim" id = Column(Integer, primary_key=True, autoincrement=True) mim_number = Column(Integer) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) type = Column(String(100)) source = Column(String(100)) med_gen_cui = Column(String(100), index=True) @@ -148,18 +157,18 @@ def as_dict(self): "type": self.type, "source": self.source, "med_gen_cui": self.med_gen_cui, - "comment": self.comment + "comment": self.comment, } class NcbiGeneEnsembl(Base): """Class definition for the ncbi_gene_ensembl table.""" - __tablename__ = 'ncbi_gene_ensembl' + __tablename__ = "ncbi_gene_ensembl" id = Column(Integer, primary_key=True, autoincrement=True) tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) ensembl_gene_identifier = Column(String(100)) rna_nucleotide_accession_version = Column(String(100)) 
ensembl_rna_identifier = Column(String(100)) @@ -177,18 +186,18 @@ def as_dict(self): "rna_nucleotide_accession_version": self.rna_nucleotide_accession_version, "ensembl_rna_identifier": self.ensembl_rna_identifier, "protein_accession_version": self.protein_accession_version, - "ensembl_protein_identifier": self.ensembl_protein_identifier + "ensembl_protein_identifier": self.ensembl_protein_identifier, } class NcbiGeneGo(Base): """Class definition for the ncbi_gene_go table.""" - __tablename__ = 'ncbi_gene_go' + __tablename__ = "ncbi_gene_go" id = Column(Integer, primary_key=True, autoincrement=True) tax_id = Column(Integer, index=True) - gene_id = Column(Integer, ForeignKey('ncbi_gene_info.gene_id')) + gene_id = Column(Integer, ForeignKey("ncbi_gene_info.gene_id")) go_id = Column(String(100), index=True) evidence = Column(String(10)) qualifier = Column(String(100)) @@ -200,24 +209,24 @@ class NcbiGeneGo(Base): def as_dict(self): """Convert object values to dictionary.""" return { - 'tax_id': self.tax_id, - 'gene_id': self.gene_id, - 'go_id': self.go_id, - 'evidence': self.evidence, - 'qualifier': self.qualifier, - 'go_term': self.go_term, - 'pmids': [x.pmid for x in self.pmids], - 'category': self.category + "tax_id": self.tax_id, + "gene_id": self.gene_id, + "go_id": self.go_id, + "evidence": self.evidence, + "qualifier": self.qualifier, + "go_term": self.go_term, + "pmids": [x.pmid for x in self.pmids], + "category": self.category, } class NcbiGeneGoPmid(Base): """Class definition for the ncbi_gene_go_pmid table.""" - __tablename__ = 'ncbi_gene_go_pmid' + __tablename__ = "ncbi_gene_go_pmid" id = Column(Integer, primary_key=True, autoincrement=True) - ncbi_gene_go_id = Column(Integer, ForeignKey('ncbi_gene_go.id')) + ncbi_gene_go_id = Column(Integer, ForeignKey("ncbi_gene_go.id")) pmid = Column(Integer) gos = relationship("NcbiGeneGo", back_populates="pmids") @@ -226,7 +235,7 @@ class NcbiGeneGoPmid(Base): class NcbiMedGenName(Base): """Class definition for the 
ncbi_medgen_name table.""" - __tablename__ = 'ncbi_medgen_name' + __tablename__ = "ncbi_medgen_name" id = Column(Integer, primary_key=True, autoincrement=True) cui = Column(String(100)) @@ -237,23 +246,23 @@ class NcbiMedGenName(Base): def as_dict(self): """Convert object values to dictionary.""" - rdit = object_as_dict(self, exclude=['id']) - rdit.update({'pmids': [x.pmid for x in self.pmids]}) + rdit = object_as_dict(self, exclude=["id"]) + rdit.update({"pmids": [x.pmid for x in self.pmids]}) return rdit class NcbiMedGenPmid(Base): """Class definition for the ncbi_medgen_pmid table.""" - __tablename__ = 'ncbi_medgen_pmid' + __tablename__ = "ncbi_medgen_pmid" id = Column(Integer, primary_key=True, autoincrement=True) - ncbi_medgen_name_id = Column(Integer, ForeignKey('ncbi_medgen_name.id')) + ncbi_medgen_name_id = Column(Integer, ForeignKey("ncbi_medgen_name.id")) pmid = Column(Integer, index=True) med_gen_name = relationship("NcbiMedGenName", back_populates="pmids") def as_dict(self): """Convert object values to dictionary.""" - rdit = object_as_dict(self, exclude=['id']) + rdit = object_as_dict(self, exclude=["id"]) return rdit diff --git a/ebel/manager/rdbms/models/nsides.py b/ebel/manager/rdbms/models/nsides.py index 1affcf5..ef5da64 100644 --- a/ebel/manager/rdbms/models/nsides.py +++ b/ebel/manager/rdbms/models/nsides.py @@ -1,6 +1,6 @@ """NSIDES RDBMS model definition.""" -from sqlalchemy import Column, Integer, String, Float +from sqlalchemy import Column, Float, Index, Integer, String from sqlalchemy.ext.declarative import declarative_base from ebel.manager.rdbms.models import object_as_dict @@ -11,7 +11,16 @@ class Nsides(Base): """Class definition for the nSIDES table.""" - __tablename__ = 'nsides' + __tablename__ = "nsides" + __table_args__ = ( + Index( + "idx_nsides_multi", + "condition_meddra_id", + "condition_concept_name", + "prr", + "mean_reporting_frequency", + ), + ) id = Column(Integer, primary_key=True) drug_rxnorn_id = Column(String(20), 
index=True) # This has to be a String because of mapping to drugbank ids @@ -22,10 +31,6 @@ class Nsides(Base): condition_meddra_id = Column(Integer) condition_concept_name = Column(String(255), index=True) - # OnSIDES specific - vocabulary_id = Column(String(10)) - domain_id = Column(String(10)) - # OFFSIDES specific a = Column(Integer) b = Column(Integer) diff --git a/ebel/manager/rdbms/models/pathway_commons.py b/ebel/manager/rdbms/models/pathway_commons.py index 9964fbf..5478aed 100644 --- a/ebel/manager/rdbms/models/pathway_commons.py +++ b/ebel/manager/rdbms/models/pathway_commons.py @@ -1,30 +1,41 @@ """Pathway Commons RDBMS model definition.""" -from sqlalchemy.orm import relationship +from sqlalchemy import BigInteger, Column, ForeignKey, Integer, String, Table from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, BigInteger, Table, ForeignKey +from sqlalchemy.orm import relationship from ebel.manager.rdbms.models import object_as_dict Base = declarative_base() -pathway_commons__pathway_name = Table('pathway_commons__pathway_name', Base.metadata, - Column('pathway_commons_id', Integer, ForeignKey('pathway_commons.id'), - index=True), - Column('pathway_commons_pathway_name_id', Integer, - ForeignKey('pathway_commons_pathway_name.id'), index=True) - ) - -pathway_commons__source = Table('pathway_commons__source', Base.metadata, - Column('pathway_commons_id', Integer, ForeignKey('pathway_commons.id'), index=True), - Column('pathway_commons_source_id', Integer, - ForeignKey('pathway_commons_source.id'), index=True) - ) +pathway_commons__pathway_name = Table( + "pathway_commons__pathway_name", + Base.metadata, + Column("pathway_commons_id", Integer, ForeignKey("pathway_commons.id"), index=True), + Column( + "pathway_commons_pathway_name_id", + Integer, + ForeignKey("pathway_commons_pathway_name.id"), + index=True, + ), +) + +pathway_commons__source = Table( + "pathway_commons__source", + Base.metadata, + 
Column("pathway_commons_id", Integer, ForeignKey("pathway_commons.id"), index=True), + Column( + "pathway_commons_source_id", + Integer, + ForeignKey("pathway_commons_source.id"), + index=True, + ), +) class PathwayCommons(Base): """Class definition for the pathway_commons table.""" - __tablename__ = 'pathway_commons' + __tablename__ = "pathway_commons" id = Column(Integer, primary_key=True) participant_a = Column(String(50), index=True) @@ -36,14 +47,10 @@ class PathwayCommons(Base): pathway_names = relationship( "PathwayName", secondary=pathway_commons__pathway_name, - back_populates="pathway_commonses" + back_populates="pathway_commonses", ) - sources = relationship( - "Source", - secondary=pathway_commons__source, - back_populates="pathway_commonses" - ) + sources = relationship("Source", secondary=pathway_commons__source, back_populates="pathway_commonses") def __str__(self): return f"{self.participant_a} {self.interaction_type} {self.participant_b}" @@ -51,16 +58,16 @@ def __str__(self): def as_dict(self): """Convert object values to dictionary.""" pathway_commons = object_as_dict(self) - pathway_commons.update({'pmids': [x.pmid for x in self.pmids]}) - pathway_commons.update({'pathway_names': [x.name for x in self.pathway_names]}) - pathway_commons.update({'sources': [x.source for x in self.sources]}) + pathway_commons.update({"pmids": [x.pmid for x in self.pmids]}) + pathway_commons.update({"pathway_names": [x.name for x in self.pathway_names]}) + pathway_commons.update({"sources": [x.source for x in self.sources]}) return pathway_commons class PathwayName(Base): """Class definition for the pathway_commons_pathway_name table.""" - __tablename__ = 'pathway_commons_pathway_name' + __tablename__ = "pathway_commons_pathway_name" id = Column(Integer, primary_key=True) name = Column(String(255), index=True) @@ -68,7 +75,8 @@ class PathwayName(Base): pathway_commonses = relationship( "PathwayCommons", secondary=pathway_commons__pathway_name, - 
back_populates="pathway_names") + back_populates="pathway_names", + ) def __str__(self): """Class string definition.""" @@ -78,12 +86,12 @@ def __str__(self): class Pmid(Base): """Class definition for the pathway_commons_pmid table.""" - __tablename__ = 'pathway_commons_pmid' + __tablename__ = "pathway_commons_pmid" id = Column(Integer, primary_key=True) pmid = Column(BigInteger, index=True) - pathway_commons_id = Column(Integer, ForeignKey('pathway_commons.id'), index=True) + pathway_commons_id = Column(Integer, ForeignKey("pathway_commons.id"), index=True) pathway_commons = relationship("PathwayCommons", back_populates="pmids") def __str__(self): @@ -94,15 +102,12 @@ def __str__(self): class Source(Base): """Class definition for the pathway_commons_source table.""" - __tablename__ = 'pathway_commons_source' + __tablename__ = "pathway_commons_source" id = Column(Integer, primary_key=True) source = Column(String(50)) - pathway_commonses = relationship( - "PathwayCommons", - secondary=pathway_commons__source, - back_populates="sources") + pathway_commonses = relationship("PathwayCommons", secondary=pathway_commons__source, back_populates="sources") def __str__(self): """Class string definition.""" diff --git a/ebel/manager/rdbms/models/protein_atlas.py b/ebel/manager/rdbms/models/protein_atlas.py index 1a7f97d..167a33a 100644 --- a/ebel/manager/rdbms/models/protein_atlas.py +++ b/ebel/manager/rdbms/models/protein_atlas.py @@ -1,6 +1,6 @@ """Protein Atlas RDBMS model definition.""" +from sqlalchemy import Column, Integer, Numeric, String, Text from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Text, Numeric Base = declarative_base() @@ -8,7 +8,7 @@ class ProteinAtlasNormalTissue(Base): """Class definition for the protein_atlas_normal_tissue table.""" - __tablename__ = 'protein_atlas_normal_tissue' + __tablename__ = "protein_atlas_normal_tissue" id = Column(Integer, primary_key=True) gene = Column(String(100), 
index=True) @@ -20,18 +20,20 @@ class ProteinAtlasNormalTissue(Base): def as_dict(self): """Convert object values to dictionary.""" - return {'gene': self.gene, - 'gene_name': self.gene_name, - 'tissue': self.tissue, - 'cell_type': self.cell_type, - 'level': self.level, - 'reliability': self.reliability} + return { + "gene": self.gene, + "gene_name": self.gene_name, + "tissue": self.tissue, + "cell_type": self.cell_type, + "level": self.level, + "reliability": self.reliability, + } class ProteinAtlasSubcellularLocation(Base): """Class definition for the protein_atlas_subcellular_location table.""" - __tablename__ = 'protein_atlas_subcellular_location' + __tablename__ = "protein_atlas_subcellular_location" id = Column(Integer, primary_key=True) gene = Column(String(100)) @@ -51,26 +53,28 @@ class ProteinAtlasSubcellularLocation(Base): def as_dict(self): """Convert object values to dictionary.""" - return {'gene': self.gene, - 'gene_name': self.gene_name, - 'reliability': self.reliability, - 'main_location': self.main_location, - 'additional_location': self.additional_location, - 'extracellular_location': self.extracellular_location, - 'enhanced': self.enhanced, - 'supported': self.supported, - 'approved': self.approved, - 'uncertain': self.uncertain, - 'single_cell_variation_intensity': self.single_cell_variation_intensity, - 'single_cell_variation_spatial': self.single_cell_variation_spatial, - 'cell_cycle_dependency': self.cell_cycle_dependency, - 'go_id': self.go_id} + return { + "gene": self.gene, + "gene_name": self.gene_name, + "reliability": self.reliability, + "main_location": self.main_location, + "additional_location": self.additional_location, + "extracellular_location": self.extracellular_location, + "enhanced": self.enhanced, + "supported": self.supported, + "approved": self.approved, + "uncertain": self.uncertain, + "single_cell_variation_intensity": self.single_cell_variation_intensity, + "single_cell_variation_spatial": 
self.single_cell_variation_spatial, + "cell_cycle_dependency": self.cell_cycle_dependency, + "go_id": self.go_id, + } class ProteinAtlasRnaTissueConsensus(Base): """Class definition for the protein_atlas_rna_tissue_consensus table.""" - __tablename__ = 'protein_atlas_rna_tissue_consensus' + __tablename__ = "protein_atlas_rna_tissue_consensus" id = Column(Integer, primary_key=True) gene = Column(String(100), index=True) @@ -80,16 +84,18 @@ class ProteinAtlasRnaTissueConsensus(Base): def as_dict(self): """Convert object values to dictionary.""" - return {'gene': self.gene, - 'gene_name': self.gene_name, - 'tissue': self.tissue, - 'n_tpm': self.nx} + return { + "gene": self.gene, + "gene_name": self.gene_name, + "tissue": self.tissue, + "n_tpm": self.nx, + } class ProteinAtlasRnaBrainGtex(Base): """Class definition for the protein_atlas_rna_brain_gtex table.""" - __tablename__ = 'protein_atlas_rna_brain_gtex' + __tablename__ = "protein_atlas_rna_brain_gtex" id = Column(Integer, primary_key=True) gene = Column(String(100), index=True) @@ -102,19 +108,19 @@ class ProteinAtlasRnaBrainGtex(Base): def as_dict(self): """Convert object values to dictionary.""" return { - 'gene': self.gene, - 'gene_name': self.gene_name, - 'brain_region': self.brain_region, - 'tpm': self.tpm, - 'p_tpm': self.p_tpm, - 'n_tpm': self.nx + "gene": self.gene, + "gene_name": self.gene_name, + "brain_region": self.brain_region, + "tpm": self.tpm, + "p_tpm": self.p_tpm, + "n_tpm": self.nx, } class ProteinAtlasRnaBrainFantom(Base): """Class definition for the protein_atlas_rna_brain_fantom table.""" - __tablename__ = 'protein_atlas_rna_brain_fantom' + __tablename__ = "protein_atlas_rna_brain_fantom" id = Column(Integer, primary_key=True) gene = Column(String(100)) @@ -126,18 +132,20 @@ class ProteinAtlasRnaBrainFantom(Base): def as_dict(self): """Convert object values to dictionary.""" - return {'gene': self.gene, - 'gene_name': self.gene_name, - 'brain_region': self.brain_region, - 
'tags_per_million': self.tags_per_million, - 'scaled_tags_per_million': self.scaled_tags_per_million, - 'n_tpm': self.nx} + return { + "gene": self.gene, + "gene_name": self.gene_name, + "brain_region": self.brain_region, + "tags_per_million": self.tags_per_million, + "scaled_tags_per_million": self.scaled_tags_per_million, + "n_tpm": self.nx, + } class ProteinAtlasRnaMouseBrainAllen(Base): """Class definition for the protein_atlas_rna_mouse_brain_allen table.""" - __tablename__ = 'protein_atlas_rna_mouse_brain_allen' + __tablename__ = "protein_atlas_rna_mouse_brain_allen" id = Column(Integer, primary_key=True) gene = Column(String(100)) @@ -148,8 +156,8 @@ class ProteinAtlasRnaMouseBrainAllen(Base): def as_dict(self): """Convert object values to dictionary.""" return { - 'gene': self.gene, - 'gene_name': self.gene_name, - 'brain_region': self.brain_region, - 'expression_energy': self.expression_energy + "gene": self.gene, + "gene_name": self.gene_name, + "brain_region": self.brain_region, + "expression_energy": self.expression_energy, } diff --git a/ebel/manager/rdbms/models/reactome.py b/ebel/manager/rdbms/models/reactome.py index d29a565..0624899 100644 --- a/ebel/manager/rdbms/models/reactome.py +++ b/ebel/manager/rdbms/models/reactome.py @@ -10,7 +10,7 @@ class Reactome(Base): """Class definition for the reactome table.""" - __tablename__ = 'reactome' + __tablename__ = "reactome" id = Column(Integer, primary_key=True) identifier = Column(String(50), index=True) uniprot_accession = Column(String(50), index=True) diff --git a/ebel/manager/rdbms/models/stringdb.py b/ebel/manager/rdbms/models/stringdb.py index b7d1fb1..31842a5 100644 --- a/ebel/manager/rdbms/models/stringdb.py +++ b/ebel/manager/rdbms/models/stringdb.py @@ -1,7 +1,7 @@ """StringDB RDBMS model definition.""" +from sqlalchemy import Boolean, Column, Integer, SmallInteger, String from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, SmallInteger, 
Boolean from ebel.manager.rdbms.models import object_as_dict @@ -11,7 +11,7 @@ class StringDb(Base): """Class definition for the stringdb table.""" - __tablename__ = 'stringdb' + __tablename__ = "stringdb" id = Column(Integer, primary_key=True) @@ -36,27 +36,27 @@ class StringDb(Base): def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) class StringDbProtein(Base): """Class definition for the stringdb_protein table.""" - __tablename__ = 'stringdb_protein' + __tablename__ = "stringdb_protein" id = Column(Integer, primary_key=True) - protein_external_id = Column(String(50), nullable=False, index=True) + string_protein_id = Column(String(50), nullable=False, index=True) preferred_name = Column(String(50), nullable=False, index=True) def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) class StringDbAction(Base): """Class definition for the stringdb_action table.""" - __tablename__ = 'stringdb_action' + __tablename__ = "stringdb_action" id = Column(Integer, primary_key=True) item_id_a = Column(String(50), nullable=False) item_id_b = Column(String(50), nullable=False) @@ -70,4 +70,4 @@ class StringDbAction(Base): def as_dict(self): """Convert object values to dictionary.""" - return object_as_dict(self, exclude=['id']) + return object_as_dict(self, exclude=["id"]) diff --git a/ebel/manager/rdbms/models/uniprot.py b/ebel/manager/rdbms/models/uniprot.py index 7a848f5..39a86f3 100644 --- a/ebel/manager/rdbms/models/uniprot.py +++ b/ebel/manager/rdbms/models/uniprot.py @@ -1,39 +1,49 @@ """UniProt RDBMS model definition.""" -from sqlalchemy.orm import relationship -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import Column, Integer, String, Table, Text, ForeignKey - from collections import defaultdict -Base = declarative_base() - 
-uniprot__uniprot_keyword = Table('uniprot__uniprot_keyword', Base.metadata, - Column('uniprot_id', Integer, ForeignKey('uniprot.id')), - Column('uniprot_keyword_id', Integer, - ForeignKey('uniprot_keyword.keywordid')) - ) - -uniprot__uniprot_host = Table('uniprot__uniprot_host', Base.metadata, - Column('uniprot_id', Integer, ForeignKey('uniprot.id')), - Column('uniprot_organism_id', Integer, ForeignKey('uniprot_organism.taxid')) - ) +from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import relationship -uniprot__uniprot_xref = Table('uniprot__uniprot_xref', Base.metadata, - Column('uniprot_id', Integer, ForeignKey('uniprot.id')), - Column('uniprot_xref_id', Integer, ForeignKey('uniprot_xref.id')) - ) +Base = declarative_base() -uniprot__uniprot_subcellular_location = Table('uniprot__uniprot_subcellular_location', Base.metadata, - Column('uniprot_id', Integer, ForeignKey('uniprot.id')), - Column('uniprot_subcellular_location_id', - Integer, ForeignKey('uniprot_subcellular_location.id')) - ) +uniprot__uniprot_keyword = Table( + "uniprot__uniprot_keyword", + Base.metadata, + Column("uniprot_id", Integer, ForeignKey("uniprot.id")), + Column("uniprot_keyword_id", Integer, ForeignKey("uniprot_keyword.keywordid")), +) + +uniprot__uniprot_host = Table( + "uniprot__uniprot_host", + Base.metadata, + Column("uniprot_id", Integer, ForeignKey("uniprot.id")), + Column("uniprot_organism_id", Integer, ForeignKey("uniprot_organism.taxid")), +) + +uniprot__uniprot_xref = Table( + "uniprot__uniprot_xref", + Base.metadata, + Column("uniprot_id", Integer, ForeignKey("uniprot.id")), + Column("uniprot_xref_id", Integer, ForeignKey("uniprot_xref.id")), +) + +uniprot__uniprot_subcellular_location = Table( + "uniprot__uniprot_subcellular_location", + Base.metadata, + Column("uniprot_id", Integer, ForeignKey("uniprot.id")), + Column( + "uniprot_subcellular_location_id", + Integer, + 
ForeignKey("uniprot_subcellular_location.id"), + ), +) class Uniprot(Base): """Class definition for the UniProt table.""" - __tablename__ = 'uniprot' + __tablename__ = "uniprot" id = Column(Integer, primary_key=True) @@ -41,37 +51,26 @@ class Uniprot(Base): name = Column(String(100), nullable=False, unique=True) recommended_name = Column(String(255), nullable=True) - taxid = Column(Integer, ForeignKey('uniprot_organism.taxid'), nullable=False, index=True) + taxid = Column(Integer, ForeignKey("uniprot_organism.taxid"), nullable=False, index=True) organism = relationship("Organism") - function_id = Column(Integer, ForeignKey('uniprot_function.id'), nullable=True) + function_id = Column(Integer, ForeignKey("uniprot_function.id"), nullable=True) function = relationship("Function") gene_names = relationship("Gene", back_populates="uniprot") gene_symbol = relationship("GeneSymbol", uselist=False, back_populates="uniprot") - keywords = relationship( - "Keyword", - secondary=uniprot__uniprot_keyword, - back_populates="uniprots") + keywords = relationship("Keyword", secondary=uniprot__uniprot_keyword, back_populates="uniprots") - hosts = relationship( - "Organism", - secondary=uniprot__uniprot_host, - back_populates="uniprots" - ) + hosts = relationship("Organism", secondary=uniprot__uniprot_host, back_populates="uniprots") - xrefs = relationship( - "Xref", - secondary=uniprot__uniprot_xref, - back_populates="uniprots" - ) + xrefs = relationship("Xref", secondary=uniprot__uniprot_xref, back_populates="uniprots") subcellular_locations = relationship( "SubcellularLocation", secondary=uniprot__uniprot_subcellular_location, - back_populates="uniprots" + back_populates="uniprots", ) def __repr__(self): @@ -85,28 +84,28 @@ def as_dict(self): xrefs_grouped = {k: sorted(v) for k, v in xrefs_grouped.items()} return { - 'name': self.name, - 'accession': self.accession, - 'recommended_name': self.recommended_name, - 'taxid': self.taxid, - 'function_description': 
self.function.description if self.function else self.function, - 'gene_names': [x.name for x in self.gene_names], - 'gene_symbol': self.gene_symbol.symbol if self.gene_symbol else self.gene_symbol, - 'keywords': [{'keyword': x.keyword_name, 'id': x.keywordid} for x in self.keywords], - 'hosts': [{'name': x.scientific_name, 'taxid': x.taxid} for x in self.hosts], - 'xrefs': xrefs_grouped, - 'subcellular_locations': [x.name for x in self.subcellular_locations], - 'organism': self.organism.scientific_name + "name": self.name, + "accession": self.accession, + "recommended_name": self.recommended_name, + "taxid": self.taxid, + "function_description": self.function.description if self.function else self.function, + "gene_names": [x.name for x in self.gene_names], + "gene_symbol": self.gene_symbol.symbol if self.gene_symbol else self.gene_symbol, + "keywords": [{"keyword": x.keyword_name, "id": x.keywordid} for x in self.keywords], + "hosts": [{"name": x.scientific_name, "taxid": x.taxid} for x in self.hosts], + "xrefs": xrefs_grouped, + "subcellular_locations": [x.name for x in self.subcellular_locations], + "organism": self.organism.scientific_name, } class GeneSymbol(Base): """Class definition for the uniprot_gene_symbol table.""" - __tablename__ = 'uniprot_gene_symbol' + __tablename__ = "uniprot_gene_symbol" id = Column(Integer, primary_key=True) symbol = Column(String(100), nullable=False, index=True) - uniprot_id = Column(Integer, ForeignKey('uniprot.id')) + uniprot_id = Column(Integer, ForeignKey("uniprot.id")) uniprot = relationship("Uniprot", back_populates="gene_symbol") def __repr__(self): @@ -117,25 +116,22 @@ def __repr__(self): class Gene(Base): """Class definition for the uniprot_gene table.""" - __tablename__ = 'uniprot_gene' + __tablename__ = "uniprot_gene" id = Column(Integer, primary_key=True) name = Column(String(100), nullable=False, index=True) - uniprot_id = Column(Integer, ForeignKey('uniprot.id')) + uniprot_id = Column(Integer, 
ForeignKey("uniprot.id")) uniprot = relationship("Uniprot", back_populates="gene_names") class Keyword(Base): """Class definition for the uniprot_keyword table.""" - __tablename__ = 'uniprot_keyword' + __tablename__ = "uniprot_keyword" keywordid = Column(Integer, primary_key=True) keyword_name = Column(String(100), index=True) - uniprots = relationship( - "Uniprot", - secondary=uniprot__uniprot_keyword, - back_populates="keywords") + uniprots = relationship("Uniprot", secondary=uniprot__uniprot_keyword, back_populates="keywords") def __repr__(self): """Define repr.""" @@ -145,21 +141,18 @@ def __repr__(self): class Organism(Base): """Class definition for the uniprot_organism table.""" - __tablename__ = 'uniprot_organism' + __tablename__ = "uniprot_organism" taxid = Column(Integer, primary_key=True) scientific_name = Column(String(255)) # TODO:Check if index=True with is possible - uniprots = relationship( - "Uniprot", - secondary=uniprot__uniprot_host, - back_populates="hosts") + uniprots = relationship("Uniprot", secondary=uniprot__uniprot_host, back_populates="hosts") class SubcellularLocation(Base): """Class definition for the uniprot_subcellular_location table.""" - __tablename__ = 'uniprot_subcellular_location' + __tablename__ = "uniprot_subcellular_location" id = Column(Integer, primary_key=True) @@ -168,29 +161,27 @@ class SubcellularLocation(Base): uniprots = relationship( "Uniprot", secondary=uniprot__uniprot_subcellular_location, - back_populates="subcellular_locations") + back_populates="subcellular_locations", + ) class Xref(Base): """Class definition for the uniprot_xref table.""" - __tablename__ = 'uniprot_xref' + __tablename__ = "uniprot_xref" id = Column(Integer, primary_key=True) db = Column(String(50), index=True) identifier = Column(String(100), index=True) - uniprots = relationship( - "Uniprot", - secondary=uniprot__uniprot_xref, - back_populates="xrefs") + uniprots = relationship("Uniprot", secondary=uniprot__uniprot_xref, 
back_populates="xrefs") class Function(Base): """Class definition for the uniprot_function table.""" - __tablename__ = 'uniprot_function' + __tablename__ = "uniprot_function" id = Column(Integer, primary_key=True) diff --git a/ebel/parser.py b/ebel/parser.py index d582e7d..9ac3ef2 100755 --- a/ebel/parser.py +++ b/ebel/parser.py @@ -1,109 +1,59 @@ """This module allows to parse BEL scripts.""" -import re -import json import codecs +import json import logging - +import re +from collections import OrderedDict, defaultdict from copy import copy +from typing import Any, Iterable, List + from lark import Lark -from lark.tree import Tree -from lark.lexer import Token -from pandas import DataFrame -from typing import List, Any, Iterable -from collections import defaultdict, OrderedDict from lark.exceptions import UnexpectedInput, UnexpectedToken +from pandas import DataFrame -from ebel.transformers import _BelTransformer +from ebel.constants import GRAMMAR_BEL_PATH, GRAMMAR_START_LINE from ebel.errors import BelSyntaxError - -from ebel.constants import GRAMMAR_START_LINE, GRAMMAR_BEL_PATH +from ebel.manager.models import load_grammar +from ebel.transformers import _BelTransformer # TODO: check all strings if they can be stored in constants # Following trees (rules in grammar) have one 1 value. 
This list is use in creation of json file -trees_with_one_value = ("document_name", "document_version", "document_description", "document_copyright", - "document_authors", "document_licences", "document_contact_info", "keyword", "anno_keyword", - "c_type", "c_title", "c_ref", "evidence", "frag_range", "frag_descriptor", "hgvs", - "gene_fusion_range", "rna_fusion_range", "protein_fusion_range", "document_keywords") +trees_with_one_value = ( + "document_name", + "document_version", + "document_description", + "document_copyright", + "document_authors", + "document_licences", + "document_contact_info", + "keyword", + "anno_keyword", + "c_type", + "c_title", + "c_ref", + "evidence", + "frag_range", + "frag_descriptor", + "hgvs", + "gene_fusion_range", + "rna_fusion_range", + "protein_fusion_range", + "document_keywords", +) # Exclude this token types if json of created -exclude_token_types = ('OB', 'CB', 'QM', 'COLON', 'COMMA', 'OCB', 'CCB') +exclude_token_types = ("OB", "CB", "QM", "COLON", "COMMA", "OCB", "CCB") logger = logging.getLogger(__name__) -def load_grammar(grammar_path): - """Return eBNF grammar in lark style. - - Parameters - ---------- - grammar_path : str - path to eBNF grammar in lark style. - - Returns - ------- - string - eBNF grammar in lark style. - - """ - # FIXME: something to do here - logger.info("load grammar {}".format(grammar_path)) - with codecs.open(grammar_path, 'r', encoding="utf-8") as fd_grammar: - grammar = fd_grammar.read() - fd_grammar.close() - return grammar - - -def first_token_value(tree: Tree, subtree_name: str) -> str: - """Get the first token value of Lark tree with subtree name. - - Parameters - ---------- - tree : type - Description of parameter `tree`. - subtree_name : type - Description of parameter `subtree_name`. - - Returns - ------- - type - Description of returned object. - - """ - # TODO: Get rid of this method by using a Transformer? Is this possible? 
- - for subtree in tree.iter_subtrees(): - if subtree.data == subtree_name: - return [node.value for node in subtree.children if isinstance(node, Token)][0] - - -def first_real_token_value(tokens: List[Token], purge: bool) -> str: - """Return value of first token not excluded by `exclude_token_types`. - - Parameters - ---------- - tokens: List[Token] - list of lark.lexer.Token - purge: bool - set if value will be purged. - - Returns - ------- - str - String value from first 'real' token. - """ - t = [token for token in tokens if token.type not in exclude_token_types][0] - if purge: - t.value = re.sub(r"\s{2,}", " ", t.value.replace("\\\n", "").strip()) - return t.value - - def camel_case(name: str) -> str: """Camel case a string.""" - first, *rest = name.split('_') - return first + ''.join(word.capitalize() for word in rest) + first, *rest = name.split("_") + return first + "".join(word.capitalize() for word in rest) def tupleit(lst: list) -> tuple: @@ -117,7 +67,6 @@ def to_tuple(py_obj: Any) -> tuple: # check if instance is dict, str, Iterable or something different if isinstance(py_obj, dict): - for k, v in py_obj.items(): lst.append((k, to_tuple(v))) @@ -148,7 +97,7 @@ def get_values(childs: list, num_expected_values=0) -> list: """ values = [x.value for x in childs if x.type not in exclude_token_types] if num_expected_values: - values = values + [''] * (num_expected_values - len(values)) + values = values + [""] * (num_expected_values - len(values)) return values @@ -283,32 +232,26 @@ def check_bel_script_line_by_line(bel_script_path: str, bel_version: str) -> Lis List[str] List of errors found in BEL script. 
""" - logger.info("Start syntax check for {} line by line with grammar BEL {}".format( - bel_script_path, bel_version)) + logger.info("Start syntax check for {} line by line with grammar BEL {}".format(bel_script_path, bel_version)) errors = [] grammar = load_grammar(GRAMMAR_BEL_PATH[str(bel_version)]) - parser = Lark(grammar, - start=GRAMMAR_START_LINE, - parser='lalr', - lexer="contextual") + parser = Lark(grammar, start=GRAMMAR_START_LINE, parser="lalr", lexer="contextual") cached_line = "" - with codecs.open(bel_script_path, 'r', encoding="utf-8") as fd: - + with codecs.open(bel_script_path, "r", encoding="utf-8") as fd: lines = fd.readlines() - if not re.search('(\n|\r|\r\n)$', lines[-1]): + if not re.search("(\n|\r|\r\n)$", lines[-1]): lines[-1] += "\n" num_and_lines = OrderedDict(enumerate(lines, 1)) for line_num, line in copy(num_and_lines).items(): - - if re.search(r'\\\s*(\n|\r|\r\n)$', line): - - num_and_lines[line_num + 1] = num_and_lines[line_num].strip()[:-1] + " " + \ - num_and_lines[line_num + 1] + if re.search(r"\\\s*(\n|\r|\r\n)$", line): + num_and_lines[line_num + 1] = ( + num_and_lines[line_num].strip()[:-1] + " " + num_and_lines[line_num + 1] + ) del num_and_lines[line_num] for line_number, line in num_and_lines.items(): @@ -318,7 +261,6 @@ def check_bel_script_line_by_line(bel_script_path: str, bel_version: str) -> Lis continue elif cached_line: - if not line.endswith("\\n"): try: cached_line += line @@ -342,9 +284,7 @@ def check_bel_script_line_by_line(bel_script_path: str, bel_version: str) -> Lis return errors - def check_bel_script(self, bel_script_path: str, - bel_version: str, - force_new_db: bool = False) -> dict: + def check_bel_script(self, bel_script_path: str, bel_version: str, force_new_db: bool = False) -> dict: """Check file with BEL script for syntax correctness. 
Parameters @@ -365,21 +305,21 @@ def check_bel_script(self, bel_script_path: str, if bel_version.startswith("2"): # TODO change this hardcoded value transformer = _BelTransformer() + else: logger.error(f"Transformer for version {bel_version} not implemented", exc_info=True) raise parser = Lark( grammar, - start='script', - parser='lalr', + start="script", + parser="lalr", lexer="contextual", - transformer=transformer + transformer=transformer, ) - with codecs.open(bel_script_path, 'r', encoding="utf-8") as fd: + with codecs.open(bel_script_path, "r", encoding="utf-8") as fd: bel_content = fd.read() + "\n" - fd.close() tree = None warnings = None @@ -393,9 +333,7 @@ def check_bel_script(self, bel_script_path: str, logger.info("UnexpectedInput exception in check_bel_script: %s" % exc) errors = self.check_bel_script_line_by_line(bel_script_path, bel_version) - return {'errors': errors, - 'tree': tree, - 'warnings': warnings} + return {"errors": errors, "tree": tree, "warnings": warnings} def write_error_report(data_frame: DataFrame, file_path: str) -> None: @@ -413,7 +351,7 @@ def write_error_report(data_frame: DataFrame, file_path: str) -> None: type Description of returned object. """ - if file_path.endswith('.xlsx'): + if file_path.endswith(".xlsx"): data_frame.to_excel(file_path) else: data_frame.to_csv(file_path) @@ -430,7 +368,7 @@ def write_warning_report(data_frame: DataFrame, file_path: str) -> None: file path to error report. """ - if file_path.endswith('.xlsx'): + if file_path.endswith(".xlsx"): data_frame.to_excel(file_path) else: data_frame.to_csv(file_path) @@ -459,9 +397,7 @@ def check_bel_script_line_by_line(bel_script_path, error_report_file_path, bel_v return data_frame -def check_bel_script(bel_script_path, - bel_version, - force_new_db=False) -> dict: +def check_bel_script(bel_script_path, bel_version, force_new_db=False) -> dict: """Check BEL script. 
Parameters @@ -482,7 +418,7 @@ def check_bel_script(bel_script_path, result = bel_parser.check_bel_script( bel_script_path=bel_script_path, force_new_db=force_new_db, - bel_version=bel_version + bel_version=bel_version, ) return result diff --git a/ebel/tools.py b/ebel/tools.py index 65e2c11..62fba6f 100644 --- a/ebel/tools.py +++ b/ebel/tools.py @@ -1,23 +1,21 @@ """General methods used by e(BE:L) modules.""" -import re +import configparser import gzip -import shutil import hashlib -import configparser - import os.path - +import re +import shutil from types import GeneratorType +from typing import Iterable, List, Union from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker from sqlalchemy.engine.base import Engine -from typing import Iterable, Union, List +from sqlalchemy.orm import sessionmaker from ebel import defaults -from ebel.config import write_to_config, get_config_value -from ebel.defaults import CONN_STR_DEFAULT +from ebel.config import get_config_value, write_to_config from ebel.constants import DATA_DIR +from ebel.defaults import CONN_STR_DEFAULT class BelRdb(object): @@ -31,7 +29,7 @@ def __new__(cls): BelRdb.__instance = object.__new__(cls) connection_string = _get_connection_string() dialect = re.search(r"^(\w+)(\+\w+)?:", connection_string).group(1) - if dialect == 'mysql': + if dialect == "mysql": utf8mb4 = "charset=utf8mb4" if utf8mb4 not in connection_string: connection_string = connection_string + f"?{utf8mb4}" @@ -44,7 +42,7 @@ def __new__(cls): def _get_connection_string(): """Get the sqlalchemy connection string from config file, sets the default string if not there.""" - return get_config_value('DATABASE', 'sqlalchemy_connection_string', CONN_STR_DEFAULT) + return get_config_value("DATABASE", "sqlalchemy_connection_string", CONN_STR_DEFAULT) def _get_engine() -> Engine: @@ -81,8 +79,8 @@ def get_standard_name(name: str) -> str: """Return standard name.""" part_of_name = [x for x in re.findall("[A-Z]*[a-z0-9]*", name) 
if x] new_name = "_".join(part_of_name).lower() - if re.search(r'^\d+', new_name): - new_name = '_' + new_name + if re.search(r"^\d+", new_name): + new_name = "_" + new_name return new_name @@ -110,8 +108,8 @@ def get_file_name(url_or_path): def gunzip(file_path: str, file_path_gunzipped: str): """Gunzip a file.""" - with gzip.open(file_path, 'rb') as f_in: - with open(file_path_gunzipped, 'wb') as f_out: + with gzip.open(file_path, "rb") as f_in: + with open(file_path_gunzipped, "wb") as f_out: shutil.copyfileobj(f_in, f_out) diff --git a/ebel/transformers.py b/ebel/transformers.py index 42ca887..6e112b4 100755 --- a/ebel/transformers.py +++ b/ebel/transformers.py @@ -1,15 +1,35 @@ """Transformer module for the transformation of lark trees.""" -import typing import logging +import re +import typing +from collections import Counter, OrderedDict, defaultdict, namedtuple +from typing import DefaultDict, Dict, Generator, List, Set -from lark.tree import Tree from lark import Transformer from lark.lexer import Token -from collections import defaultdict, namedtuple, OrderedDict +from lark.tree import Tree -from ebel.cache import _BelScript -from ebel.constants import FILE, URL, PATTERN, LIST +from ebel.cache import logger +from ebel.constants import ALLOWED_TYPES, FILE, GRAMMAR_START_ANNO, GRAMMAR_START_NS, LIST, PATTERN, URL +from ebel.errors import ( + NotDownloadedFromUrl, + NotInAnnotationList, + NotInAnnotationPattern, + NotInAnnotationUrl, + NotInNamespaceList, + NotInNamespacePattern, + NotInNamespaceUrl, + WithoutDefinedAnnotation, + WithoutDefinedNamespace, + _Error, +) +from ebel.manager.models import Annotation as AnnotationModel +from ebel.manager.models import AnnotationEntry, AnnotationManager +from ebel.manager.models import Namespace as NamespaceModel +from ebel.manager.models import NamespaceEntry, NamespaceManager, reset_tables +from ebel.tools import BelRdb +from ebel.warning_definitions import AlsoUsedInOtherNamespace, _Warning log = 
logging.getLogger(__name__) @@ -31,7 +51,6 @@ def get_token_dict2(token_tree_list: list) -> dict: ret_dict = defaultdict(list) for item in token_tree_list: - if isinstance(item, Token): ret_dict[item.type].append(item.value) @@ -66,232 +85,251 @@ class _BelTransformer(Transformer): - properties with * values => List[namedtuple] """ - exclude_token_types = ('OB', 'CB', 'QM', 'COLON', 'COMMA', 'OCB', 'CCB') - - nt_frag = namedtuple('fragment', ('range', 'descriptor')) - nt_nn = namedtuple('nn', ('namespace', 'name')) - nt_var = namedtuple('variant', ('hgvs',)) - nt_ma = namedtuple('ma', ('namespace', 'name', 'default')) - nt_pmod = namedtuple('protein_modification', ('namespace', 'name', 'type', 'amino_acid', 'position')) - nt_support = namedtuple('support', ('text', )) - nt_citation = namedtuple('citation', ('type', 'title', 'ref', 'pub_date', 'author_list', 'comment')) - nt_type = namedtuple('function', ('type', 'name')) - nt_gmod = namedtuple('gene_modification', ('namespace', 'name')) - named_tuples = (nt_frag, - nt_nn, - nt_var, - nt_pmod, - nt_support, - nt_citation, - nt_type, - nt_ma, - nt_gmod) + exclude_token_types = ("OB", "CB", "QM", "COLON", "COMMA", "OCB", "CCB") + + nt_frag = namedtuple("fragment", ("range", "descriptor")) + nt_nn = namedtuple("nn", ("namespace", "name")) + nt_var = namedtuple("variant", ("hgvs",)) + nt_ma = namedtuple("ma", ("namespace", "name", "default")) + nt_pmod = namedtuple("protein_modification", ("namespace", "name", "type", "amino_acid", "position")) + nt_support = namedtuple("support", ("text",)) + nt_citation = namedtuple("citation", ("type", "title", "ref", "pub_date", "author_list", "comment")) + nt_type = namedtuple("function", ("type", "name")) + nt_gmod = namedtuple("gene_modification", ("namespace", "name")) + named_tuples = ( + nt_frag, + nt_nn, + nt_var, + nt_pmod, + nt_support, + nt_citation, + nt_type, + nt_ma, + nt_gmod, + ) def __init__(self, force_new_db=False): Transformer.__init__(self) self.cache = 
_BelScript(force_new_db=force_new_db) self.citations = [] - def script(self, n): + @staticmethod + def script(n): return n - def statements_and_sets(self, n): - return {'statements_and_sets': n} + @staticmethod + def statements_and_sets(n): + return {"statements_and_sets": n} def abundance(self, n): """Return abundance.""" - return [self.nt_type('abundance', 'abundance'), n] + return [self.nt_type("abundance", "abundance"), n] def gene(self, n): """Return gene.""" - return [self.nt_type('abundance', 'gene'), n] + return [self.nt_type("abundance", "gene"), n] def micro_rna(self, n): """Return micro_rna.""" - return [self.nt_type('abundance', 'micro_rna'), n] + return [self.nt_type("abundance", "micro_rna"), n] def rna(self, n): """Return rna.""" - return [self.nt_type('abundance', 'rna'), n] + return [self.nt_type("abundance", "rna"), n] def protein(self, n): """Return protein.""" - return [self.nt_type('abundance', 'protein'), n] + return [self.nt_type("abundance", "protein"), n] def population(self, n): """Return protein.""" - return [self.nt_type('abundance', 'population'), n] + return [self.nt_type("abundance", "population"), n] def composite(self, n): """Return composite.""" - return [self.nt_type('list', 'composite'), sorted(n)] + return [self.nt_type("list", "composite"), sorted(n)] - def definitions(self, n): + @staticmethod + def definitions(n): """Return definitions of namespace and annotations.""" - return {'definitions': n} + return {"definitions": n} def sec(self, n): """Return cellSecretion.""" - return [self.nt_type('transformation', 'cell_secretion'), [n[0]]] + return [self.nt_type("transformation", "cell_secretion"), [n[0]]] def _ns_anno_props(self, n): """Return properties of namespace or annotation.""" - token_names = ('KEYWORD', ('URL', 'PATTERN', 'LIST'), ('URL_DEF', 'PATTERN_DEF', 'FILE_PATH')) + token_names = ( + "KEYWORD", + ("URL", "PATTERN", "LIST"), + ("URL_DEF", "PATTERN_DEF", "FILE_PATH"), + ) keyword, type_, value = self._get_values(n, 
token_names) value_list = [] - if type_ == 'LIST': - in_list = self._get_tree(n, 'in_list') - value_list = self._get_all_values_by_name(in_list, 'ENTRY') - - props = OrderedDict([ - ('keyword', keyword), - ('type', type_), - ('value', value), - ('value_list', sorted(value_list)) - ]) + if type_ == "LIST": + in_list = self._get_tree(n, "in_list") + value_list = self._get_all_values_by_name(in_list, "ENTRY") + + props = OrderedDict( + [ + ("keyword", keyword), + ("type", type_), + ("value", value), + ("value_list", sorted(value_list)), + ] + ) return props def namespace(self, n): """Return namespace vertex.""" token_dict = get_token_dict2(n) - keyword = token_dict['KEYWORD'][0] + keyword = token_dict["KEYWORD"][0] namespace_type, value = None, None - if 'LIST' in token_dict: - value = token_dict['ENTRY'] + if "LIST" in token_dict: + value = token_dict["ENTRY"] namespace_type = LIST - elif 'URL_DEF' in token_dict: - value = token_dict['URL_DEF'][0] + elif "URL_DEF" in token_dict: + value = token_dict["URL_DEF"][0] namespace_type = URL - elif 'FILE_PATH' in token_dict: - value = token_dict['FILE_PATH'][0] + elif "FILE_PATH" in token_dict: + value = token_dict["FILE_PATH"][0] namespace_type = FILE - elif 'PATTERN_DEF' in token_dict: - value = token_dict['PATTERN_DEF'][0] + elif "PATTERN_DEF" in token_dict: + value = token_dict["PATTERN_DEF"][0] namespace_type = PATTERN self.cache.set_namespace_definition(namespace_type, keyword, value) props = self._ns_anno_props(n) - return {'namespace': props} + return {"namespace": props} def annotation(self, n): """Return namespace vertex.""" token_dict = get_token_dict2(n) - keyword = token_dict['KEYWORD'][0] + keyword = token_dict["KEYWORD"][0] annotation_type, value = None, None - if 'LIST' in token_dict: - value = token_dict['ENTRY'] + if "LIST" in token_dict: + value = token_dict["ENTRY"] annotation_type = LIST - elif 'URL_DEF' in token_dict: - value = token_dict['URL_DEF'][0] + elif "URL_DEF" in token_dict: + value = 
token_dict["URL_DEF"][0] annotation_type = URL - elif 'FILE_PATH' in token_dict: - value = token_dict['FILE_PATH'][0] + elif "FILE_PATH" in token_dict: + value = token_dict["FILE_PATH"][0] annotation_type = FILE - elif 'PATTERN_DEF' in token_dict: - value = token_dict['PATTERN_DEF'][0] + elif "PATTERN_DEF" in token_dict: + value = token_dict["PATTERN_DEF"][0] annotation_type = PATTERN self.cache.set_annotation_definition(annotation_type, keyword, value) props = self._ns_anno_props(n) - return {'annotation': props} + return {"annotation": props} - def document(self, n): + @staticmethod + def document(n): """Return Document vertex.""" - return {'document': OrderedDict(sorted(n))} + return {"document": OrderedDict(sorted(n))} def _doc_prop(self, name, n): - return (name, self._get_values(n, (('STRING_IN_QUOTES', 'WORD'),))[0]) + return (name, self._get_values(n, (("STRING_IN_QUOTES", "WORD"),))[0]) def document_name(self, n): """Return document property as tuple.""" - return self._doc_prop('name', n) + return self._doc_prop("name", n) def document_description(self, n): """Return document property as tuple.""" - return self._doc_prop('description', n) + return self._doc_prop("description", n) def document_version(self, n): """Return document property as tuple.""" - return self._doc_prop('version', n) + return self._doc_prop("version", n) def document_authors(self, n): """Return document property as tuple.""" - return self._doc_prop('authors', n) + return self._doc_prop("authors", n) def document_contact_info(self, n): """Return document property as tuple.""" - return self._doc_prop('contact_info', n) + return self._doc_prop("contact_info", n) def document_copyright(self, n): """Return document property as tuple.""" - return self._doc_prop('copyright', n) + return self._doc_prop("copyright", n) def document_licences(self, n): """Return document property as tuple.""" - return self._doc_prop('licences', n) + return self._doc_prop("licences", n) def document_keywords(self, 
n): """Return document property as tuple.""" - return self._doc_prop('keywords', n) + return self._doc_prop("keywords", n) def deg(self, n): """Return degradation.""" - return [self.nt_type('transformation', 'degradation'), [n[0]]] + return [self.nt_type("transformation", "degradation"), [n[0]]] + + def complex_obj(self, n): + """Return correct complex type.""" + return self.subject(n) def complex_abundance(self, n): """Return complex as abundance.""" - return [self.nt_type('abundance', 'complex'), [n[0]]] + return [self.nt_type("abundance", "complex"), [n[0]]] def complex_list(self, n): """Return complex as list.""" - return [self.nt_type('list', 'complex'), n] + return [self.nt_type("list", "complex"), n] - def list_complex(self, n): + @staticmethod + def list_complex(n): """Return abundance list of complex.""" return sorted(n) def list(self, n): """Return list.""" - function_type = self._format_sub_obj([self.nt_type('list', 'list')])[0] - return {'object': [function_type, [self._format_sub_obj(x) for x in sorted(n)]]} + function_type = self._format_sub_obj([self.nt_type("list", "list")])[0] + return {"object": [function_type, [self._format_sub_obj(x) for x in sorted(n)]]} def act2(self, n): """Return activity.""" - abundance = self._get_tree(n, 'act_abundance').children[0] - ma = self._get_dict_value(n, 'ma') + abundance = self._get_tree(n, "act_abundance").children[0] + ma = self._get_dict_value(n, "ma") if ma: - return [self.nt_type('process', 'activity'), abundance, ma._asdict()] + return [self.nt_type("process", "activity"), abundance, ma._asdict()] else: - return [self.nt_type('process', 'activity'), abundance] + return [self.nt_type("process", "activity"), abundance] def act(self, n): """Return activity.""" - return [self.nt_type('process', 'activity'), n] + return [self.nt_type("process", "activity"), n] def tloc(self, n): """Return tloc.""" - return [self.nt_type('transformation', 'translocation'), n] + return [self.nt_type("transformation", 
"translocation"), n] - def _get_rel(self, n): - return {'relation': n[0].data} + @staticmethod + def _get_rel(n): + return {"relation": n[0].data} - def nested_relation(self, n): + @staticmethod + def nested_relation(n): """Return nested relation.""" - return {'nested_relation': n[0]} + return {"nested_relation": n[0]} def relation_with_list(self, n): """Return relation.""" @@ -333,66 +371,82 @@ def transl_relation(self, n): """Return relation.""" return self._get_rel(n) - def has_members(self, n): - """Return relation.""" - return {'relation': 'has_members'} + @staticmethod + def has_component(n): + """Return has_component relation.""" + return {"relation": "has_component"} + + @staticmethod + def has_components(n): + """Return has_components relation.""" + return {"relation": "has_components"} + + @staticmethod + def has_members(n): + """Return has_member relation.""" + return {"relation": "has_members"} - def statement(self, n: list): + @staticmethod + def statement(n: list): """Return statement.""" - return {'statement': n} + return {"statement": n} def subject(self, n): """Return subject.""" - return {'subject': self._format_sub_obj(n[0])} + return {"subject": self._format_sub_obj(n[0])} def nested_subject(self, n): """Return nested subject.""" - return {'nested_subject': self._format_sub_obj(n[0])} + return {"nested_subject": self._format_sub_obj(n[0])} def nested_object(self, n): """Return nested subject.""" - return {'nested_object': self._format_sub_obj(n[0])} + return {"nested_object": self._format_sub_obj(n[0])} - def sets(self, n): + @staticmethod + def sets(n): """Return sets.""" - return {'sets': n} + return {"sets": n} - def evidence(self, n): + @staticmethod + def evidence(n): """Return support. 
support formily known as evidence, supprtingText """ - return {'evidence': n[0].value} + return {"evidence": n[0].value} def citation(self, n): """Return citation.""" tv_dict = {x.type: x.value for x in n if isinstance(x, Token)} - c_title, c_pubdate, c_author_list, c_comment = '', '', '', '' - c_type = tv_dict['C_TYPE'] + c_title, c_pubdate, c_author_list, c_comment = "", "", "", "" + c_type = tv_dict["C_TYPE"] if "ONLY_2_PARAMETERS" in tv_dict: - c_ref = tv_dict['C_PARAM2'] + c_ref = tv_dict["C_PARAM2"] else: - c_title = tv_dict['C_PARAM2'] - c_ref = tv_dict['C_PARAM3'] - c_pubdate = tv_dict['C_PUBDATE'] if 'C_PUBDATE' in tv_dict else '' - c_author_list = tv_dict['C_AUTHORLIST'] if 'C_AUTHORLIST' in tv_dict else '' - c_comment = tv_dict['C_COMMENT'] if 'C_COMMENT' in tv_dict else '' - - nt = self.nt_citation(type=c_type, - title=c_title, - ref=c_ref, - pub_date=c_pubdate, - author_list=c_author_list, - comment=c_comment) + c_title = tv_dict["C_PARAM2"] + c_ref = tv_dict["C_PARAM3"] + c_pubdate = tv_dict["C_PUBDATE"] if "C_PUBDATE" in tv_dict else "" + c_author_list = tv_dict["C_AUTHORLIST"] if "C_AUTHORLIST" in tv_dict else "" + c_comment = tv_dict["C_COMMENT"] if "C_COMMENT" in tv_dict else "" + + nt = self.nt_citation( + type=c_type, + title=c_title, + ref=c_ref, + pub_date=c_pubdate, + author_list=c_author_list, + comment=c_comment, + ) - self.citations.append({'citation_type': c_type, 'citation_id': c_ref}) + self.citations.append({"citation_type": c_type, "citation_id": c_ref}) - return {'citation': nt._asdict()} + return {"citation": nt._asdict()} def statement_group(self, n): """Return statement group.""" - return {'statement_group': self._get_value(n, 'GROUP_NAME')} + return {"statement_group": self._get_value(n, "GROUP_NAME")} def set_annotation(self, n): """Return annotation sets.""" @@ -405,40 +459,36 @@ def set_annotation(self, n): entries.append(tkn) for entry in entries: - self.cache.set_annotation_entry( - annotation=annotation_key, - 
entry=entry.value, - token=entry - ) + self.cache.set_annotation_entry(annotation=annotation_key, entry=entry.value, token=entry) - keyword = self._get_value(n, 'KEYWORD') - entries = self._get_all_values_by_name(n, 'ANNO_SET_ENTRY') - return {'set': {keyword: entries}} + keyword = self._get_value(n, "KEYWORD") + entries = self._get_all_values_by_name(n, "ANNO_SET_ENTRY") + return {"set": {keyword: entries}} def unset(self, n): """Return unsets.""" - keywords = self._get_all_values_by_name(n, 'ANNO_KEYWORD') - return {'unset': keywords} + keywords = self._get_all_values_by_name(n, "ANNO_KEYWORD") + return {"unset": keywords} def object(self, n): """Return object.""" - return {'object': self._format_sub_obj(n[0])} + return {"object": self._format_sub_obj(n[0])} def molec_process(self, n): """Return molecular process.""" - return {'molec_process': self._format_sub_obj(n[0])} + return self.object(n) def act_or_abundance(self, n): """Return activity or abundance.""" - return {'act_or_abundance': self._format_sub_obj(n[0])} + return {"act_or_abundance": self._format_sub_obj(n[0])} def transformation(self, n): """Return transformation.""" - return {'transformation': self._format_sub_obj(n[0])} + return {"transformation": self._format_sub_obj(n[0])} def pat(self, n): """Return object.""" - return {'pat': self._format_sub_obj(n[0])} + return self.subject(n) def transc_subject(self, n): return self.subject(n) @@ -452,16 +502,24 @@ def transl_subject(self, n): def transl_object(self, n): return self.object(n) + def ortho_subject(self, n): + """Return orthologous gene subject.""" + return self.subject(n) + + def ortho_object(self, n): + """Return orthologous gene object.""" + return self.object(n) + def surf(self, n): - return [self.nt_type('transformation', 'cell_surface_expression'), n] + return [self.nt_type("transformation", "cell_surface_expression"), n] def bp(self, n): """Return biological process.""" - return [self.nt_type('process', 'biological_process'), [n[0]]] + 
return [self.nt_type("process", "biological_process"), [n[0]]] def path(self, n): """Return path.""" - return [self.nt_type('process', 'pathology'), [n[0]]] + return [self.nt_type("process", "pathology"), [n[0]]] def _format_sub_obj(self, obj): """Change namedtuples to tuple(name_of_namedtuple,dictionary).""" @@ -481,172 +539,179 @@ def protein_changes(self, n): """Create dict of protein changes.""" return self.changes(n) - def statement_comment(self, n): - return {'statement_comment': ' '.join([x.value for x in n]).strip()} + @staticmethod + def statement_comment(n): + return {"statement_comment": " ".join([x.value for x in n]).strip()} - def changes(self, n): + @staticmethod + def changes(n): """Create dict of abundance changes.""" return [list((x[0], [x[1]])) for x in sorted(list(set(n)))] - def gene_changes(self, n): + @staticmethod + def gene_changes(n): """Return gene abundance changes; Only valid in BEL 2.1.""" return [list((x[0], [x[1]])) for x in sorted(list(set(n)))] def gmod(self, n): """Return gmod; Only valid in BEL 2.1.""" - nn = [x for x in n if x.__class__.__name__ == 'nn'] + nn = [x for x in n if x.__class__.__name__ == "nn"] namespace = "TestNS" name = "TestName" if nn: namespace = nn[0].namespace name = nn[0].name - nt = self.nt_gmod( - namespace=namespace, - name=name) + nt = self.nt_gmod(namespace=namespace, name=name) - return self.nt_type('modifier', 'gmod'), nt + return self.nt_type("modifier", "gmod"), nt def ma(self, n): """Transform tokens to dict(function_name: namedtuple).""" namespace, name, default = (None,) * 3 - if type(n[0]).__name__ == 'nn': + if type(n[0]).__name__ == "nn": nt = n[0] namespace = nt.namespace name = nt.name - elif n[0].data == 'ma_default': + elif n[0].data == "ma_default": default = n[0].children[0].data return self.nt_ma(namespace=namespace, name=name, default=default) def frag(self, n) -> tuple: """Transform frag tokens to nametuple.""" - frag_range, frag_descriptor = self._get_values(n, ('F_RANGE', 
'F_DESCRIPTOR')) + frag_range, frag_descriptor = self._get_values(n, ("F_RANGE", "F_DESCRIPTOR")) nt = self.nt_frag(range=frag_range, descriptor=frag_descriptor) - return self.nt_type('modifier', 'fragment'), nt + return self.nt_type("modifier", "fragment"), nt def loc(self, n) -> tuple: """Transform tokens to dict(function_name: namedtuple).""" - return self.nt_type('modifier', 'location'), n[0] + return self.nt_type("modifier", "location"), n[0] def var(self, n) -> tuple: """Transform tokens to dict(function_name: namedtuple).""" - return self.nt_type('modifier', 'variant'), self.nt_var(n[0].value) + return self.nt_type("modifier", "variant"), self.nt_var(n[0].value) def nn(self, n): """Transform tokens to dict(function_name: namedtuple).""" - token_names = ('NAMESPACE_KEYWORD', ('NAME_WITHOUT_QUOTES', 'STRING_SIMPLE')) + token_names = ("NAMESPACE_KEYWORD", ("NAME_WITHOUT_QUOTES", "STRING_SIMPLE")) namespace, name = self._get_values(n, token_names) token_dict = {x.type: x for x in n} - name_with_quotes = token_dict.get('NAME_WITHOUT_QUOTES') - simple_string = token_dict.get('STRING_SIMPLE') + name_with_quotes = token_dict.get("NAME_WITHOUT_QUOTES") + simple_string = token_dict.get("STRING_SIMPLE") entry_token = name_with_quotes or simple_string - self.cache.set_namespace_entry( - namespace=namespace, - entry=name, - token=entry_token - ) + self.cache.set_namespace_entry(namespace=namespace, entry=name, token=entry_token) return self.nt_nn(namespace=namespace, name=name) - def pmod(self, n) -> dict: + def pmod(self, n) -> tuple: """Transform tokens to dict(function_name: namedtuple).""" - namespace, name = '', '' + namespace, name = "", "" - pos_value = self._get_value(n, 'POSITION') + pos_value = self._get_value(n, "POSITION") position = int(pos_value) if pos_value else 0 - nn = [x for x in n if x.__class__.__name__ == 'nn'] + nn = [x for x in n if x.__class__.__name__ == "nn"] if nn: namespace = nn[0].namespace name = nn[0].name - aa = self._get_dict_value(n, 
'amino_acid') - amino_acid = aa if aa else '' + aa = self._get_dict_value(n, "amino_acid") + amino_acid = aa if aa else "" - ptype = self._get_dict_value(n, 'pmod_type') - type_ = ptype if ptype else '' + ptype = self._get_dict_value(n, "pmod_type") + type_ = ptype if ptype else "" nt = self.nt_pmod( namespace=namespace, name=name, type=type_, amino_acid=amino_acid, - position=position) + position=position, + ) - return self.nt_type('modifier', 'pmod'), nt + return self.nt_type("modifier", "pmod"), nt - def amino_acid(self, n): + @staticmethod + def amino_acid(n): """Return amino acid.""" - return {'amino_acid': n[0].data.split('_')[1].upper()} + return {"amino_acid": n[0].data.split("_")[1].upper()} - def pmod_type(self, n): + @staticmethod + def pmod_type(n): """Return pmod_type.""" - return {'pmod_type': n[0].data} + return {"pmod_type": n[0].data} def from_loc(self, n): """Return tloc.""" - return [self.nt_type('translocation', 'from_location'), n] + return [self.nt_type("translocation", "from_location"), n] def to_loc(self, n): """Return tloc.""" - return [self.nt_type('translocation', 'to_location'), n] + return [self.nt_type("translocation", "to_location"), n] def rxn(self, n): """Return reaction.""" - return [self.nt_type('transformation', 'reaction'), n] + return [self.nt_type("transformation", "reaction"), n] def reactants(self, n): """Return reactants.""" - return [self.nt_type('reaction_partner', 'reactants'), sorted(n)] + return [self.nt_type("reaction_partner", "reactants"), sorted(n)] def products(self, n): """Return products.""" - return [self.nt_type('reaction_partner', 'products'), sorted(n)] + return [self.nt_type("reaction_partner", "products"), sorted(n)] - def fusion(self, n): + @staticmethod + def fusion(n): """Return fusion.""" return n[0] def _fusion_range(self, n): """Return fusion.""" - range_types = ('GENE_FUSION_RANGE', 'RNA_FUSION_RANGE', 'PROTEIN_FUSION_RANGE') + range_types = ("GENE_FUSION_RANGE", "RNA_FUSION_RANGE", 
"PROTEIN_FUSION_RANGE") fusion_range = self._get_value(n, range_types) - return OrderedDict([('fusion_range', fusion_range)]) + return OrderedDict([("fusion_range", fusion_range)]) def gene_fusion(self, n): """Return gene fusion.""" - return [self.nt_type('other', 'fusion_gene'), n] + return [self.nt_type("other", "fusion_gene"), n] def gene_fusion_range(self, n): """Return fusion range.""" return self._fusion_range(n) - def fus_gene1(self, n): - return ['gene1', n] + @staticmethod + def fus_gene1(n): + return ["gene1", n] - def fus_gene2(self, n): - return ['gene2', n] + @staticmethod + def fus_gene2(n): + return ["gene2", n] - def fus_rna1(self, n): - return ['rna1', n] + @staticmethod + def fus_rna1(n): + return ["rna1", n] - def fus_rna2(self, n): - return ['rna2', n] + @staticmethod + def fus_rna2(n): + return ["rna2", n] - def fus_protein1(self, n): - return ['protein1', n] + @staticmethod + def fus_protein1(n): + return ["protein1", n] - def fus_protein2(self, n): - return ['protein2', n] + @staticmethod + def fus_protein2(n): + return ["protein2", n] def rna_fusion(self, n): """Return rna fusion.""" - return [self.nt_type('other', 'fusion_rna'), n] + return [self.nt_type("other", "fusion_rna"), n] def rna_fusion_range(self, n): """Return fusion range.""" @@ -654,23 +719,26 @@ def rna_fusion_range(self, n): def protein_fusion(self, n): """Return protein fusion.""" - return [self.nt_type('other', 'fusion_protein'), n] + return [self.nt_type("other", "fusion_protein"), n] def protein_fusion_range(self, n): """Return fusion range.""" return self._fusion_range(n) - def _get_dict_value(self, n, key_name): + @staticmethod + def _get_dict_value(n, key_name): """Return value of key_name for first dict with key_name.""" for e in n: if isinstance(e, dict) and key_name in e: return e[key_name] - def __get_token_dict(self, tokens: typing.List[Token]) -> dict: + @staticmethod + def __get_token_dict(tokens: typing.List[Token]) -> dict: """Get dictionary of tokens with 
types as key.""" return {t.type: t.value for t in tokens if isinstance(t, Token)} - def _get_value(self, tokens, token_name): + @staticmethod + def _get_value(tokens, token_name): """Get first Token value of list with token_name. If token_name is a list method checks against list. @@ -691,12 +759,11 @@ def _get_values(self, tokens, token_names: typing.Iterable) -> list: ret_lst = [] for t_name in token_names: - if isinstance(t_name, str): - ret_lst.append(t_dict.get(t_name, '')) + ret_lst.append(t_dict.get(t_name, "")) elif isinstance(t_name, typing.Iterable): - value = '' + value = "" for name in t_name: value = t_dict.get(name) if value: @@ -705,7 +772,8 @@ def _get_values(self, tokens, token_names: typing.Iterable) -> list: return ret_lst - def _get_tree(self, n: list, tree_name): + @staticmethod + def _get_tree(n: list, tree_name): """Return first lark.Tree, namedtuple or dict linked to tree_name.""" for e in n: if isinstance(e, Tree) and e.data == tree_name: @@ -715,15 +783,739 @@ def _get_tree(self, n: list, tree_name): raise "Type not supported or empty list" - def _get_all_values_by_name(self, tree_or_tokens, token_name): + @staticmethod + def _get_all_values_by_name(tree_or_tokens, token_name): n = tree_or_tokens if isinstance(tree_or_tokens, Tree): n = tree_or_tokens.children return sorted([x.value for x in n if isinstance(x, Token) and x.type == token_name]) - def _first_tree(self, n: list) -> Tree: + @staticmethod + def _first_tree(n: list) -> Tree: """Return first tree in list.""" for e in n: if isinstance(e, Tree): return e + + +class _BelScript: + """Cache the content of the BEL script and methods to find errors and warnings.""" + + def __init__(self, force_new_db): + """Init.""" + # setup database + engine = BelRdb().engine + + self.force_new_db = force_new_db + reset_tables(engine, self.force_new_db) + + self._namespaces = Namespaces() # entries Namespace objects + self._annotations = Annotations() # entries Annotation objects + + 
self.__namespace_in_db_updated = False + self.__annotations_in_db_updated = False + + self._namespace_entries = NamespaceEntries() + self._annotation_entries = AnnotationEntries() + + self.notDownloadedFromUrls = [] + + self.namespace_manager = NamespaceManager( + model=NamespaceModel, + entries_model=NamespaceEntry, + grammar_start=GRAMMAR_START_NS, + ) + + self.annotation_manager = AnnotationManager( + model=AnnotationModel, + entries_model=AnnotationEntry, + grammar_start=GRAMMAR_START_ANNO, + ) + + def set_namespace_definition(self, as_type, keyword, value): + """Set an annotation definition with type, keyword and value value could be 'file', 'url' or 'list'. + + :param str as_type: 'file', 'url' or 'list' + :param str keyword: namespace keyword + :param str value: URL, file path or list + """ + if as_type in ALLOWED_TYPES: + self._namespaces.add(as_type, keyword, value) + return True + else: + logger.error("{} is not a allowed type of {}".format(as_type, ALLOWED_TYPES)) + return False + + def set_annotation_definition(self, as_type, keyword, value): + """Set an annotation definition with type, keyword and value could be 'file', 'url' or 'list'. + + :param str as_type: 'file', 'url' or 'list' + :param str keyword: namespace keyword + :param str value: URL, file path or list + """ + if as_type in ALLOWED_TYPES: + self._annotations.add(as_type, keyword, value) + return True + else: + logger.error("{} is not an allowed type of {}".format(as_type, ALLOWED_TYPES)) + return False + + def set_annotation_entry(self, annotation: str, entry: str, token: Token): + """Set annotation, entry and lark.lexer.Token token. + + :param str annotation: annotation + :param str entry: entry + :param lark.lexer.Token token: + """ + self._annotation_entries.set_annotation_entry(keyword=annotation, entry=entry, token=token) + + def set_namespace_entry(self, namespace: str, entry: str, token: Token): + """Set namespace, entry and lark.lexer.Token token. 
+ + :param str namespace: + :param str entry: + :param lark.lexer.Token token: + """ + if not isinstance(token, Token): + raise Exception("expecting Token in cache.set_namespace_entry") + + self._namespace_entries.set_namespace_entry(keyword=namespace, entry=entry, token=token) + + @property + def errors(self) -> List[_Error]: + """Execute all methods to find errors and warnings.""" + self.update_database() + + # all errors are children from errors._Error instances + return ( + self.notDownloadedFromUrls + + self.entries_without_namespace + + self.entries_without_annotation + + self.entries_not_in_namespace_url + + self.entries_not_in_annotation_url + + self.entries_not_in_namespace_list + + self.entries_not_in_annotation_list + + self.entries_not_in_namespace_pattern + + self.entries_not_in_annotation_pattern + ) + + @property + def warnings(self) -> List[_Warning]: + """Execute all methods to find warnings.""" + if not (self.__namespace_in_db_updated and self.__annotations_in_db_updated): + self.update_database() + + # all warnings are children from warnings._Warning instances + return self.entries_also_in_other_namespace + + @property + def entries_also_in_other_namespace(self) -> List[AlsoUsedInOtherNamespace]: + """Return WithoutDefinedNamespace list.""" + ret = [] + # extract all entries used in BEL statements and create a dict of lower entries with all keywords + entry_keyword_dict = defaultdict(set) + for keyword1, entries in self._namespace_entries.entries.items(): + for entry in entries: + entry_keyword_dict[entry.lower()] |= {keyword1} + # identify all ambiguous entries (in more than 1 namespace) + ambiguous_entries = {entry: keywords for entry, keywords in entry_keyword_dict.items() if len(keywords) > 1} + + # ToDo: iterate all lower entries an check for permutation + # for lower_entry in entry_keyword_dict: + # if lower_entry.count(",") == 1: + # reverse_without_comma = " ".join([x.strip() for x in lower_entry.split(",")][::-1]) + # if 
reverse_without_comma in entry_keyword_dict: + # print(lower_entry, + # "%s exists in %s" % (reverse_without_comma, + # entry_keyword_dict[reverse_without_comma])) + # ret.append(AlsoUsedInOtherNamespace(keyword=keyword2, + # entry=entry, + # line_number=token.line, + # column=token.column, + # hint=hint)) + + # iterate all tokens with namespace entries and check if they are also exists in ambiguous entries + for keyword2, entries_tokens in self._namespace_entries.tokens.items(): + for entry, tokens in entries_tokens.items(): + if entry.lower() in ambiguous_entries: + ambiguous_tokens = self._namespace_entries.tokens[keyword2][entry] + for token in ambiguous_tokens: + hint = "%s exists also in %s" % ( + entry, + ambiguous_entries[entry.lower()] - {keyword2}, + ) + ret.append( + AlsoUsedInOtherNamespace( + keyword=keyword2, + entry=entry, + line_number=token.line, + column=token.column, + hint=hint, + ) + ) + return ret + + @property + def entries_not_in_namespace_pattern(self) -> List[NotInNamespacePattern]: + """Return a list of entries not fitting a given namespace pattern.""" + ret = [] + + ns_pattern_kwds = self.used_namespace_keywords & self._namespaces.keywords_by_type(PATTERN) + + for kwd in ns_pattern_kwds: + regex = self._namespaces.keyword_dict[kwd].value + pattern = re.compile("^" + regex + "$") + elcs = self._namespace_entries.get_entry_line_column_list_by_keyword(kwd) + for entry, line, column in elcs: + if not pattern.search(entry): + ret.append(NotInNamespacePattern(keyword=kwd, entry=entry, line_number=line, column=column)) + return ret + + @property + def entries_not_in_annotation_pattern(self) -> List[NotInAnnotationPattern]: + """Return a list of entries not fitting a given annotation pattern.""" + ret = [] + + anno_pattern_kwds = self.used_annotation_keywords & self._annotations.keywords_by_type(PATTERN) + + for kwd in anno_pattern_kwds: + regex = self._annotations.keyword_dict[kwd].value + pattern = re.compile("^" + regex + "$") + elcs = 
self._annotation_entries.get_entry_line_column_list_by_keyword(kwd) + for entry, line, column in elcs: + if not pattern.search(entry): + ret.append(NotInAnnotationPattern(keyword=kwd, entry=entry, line_number=line, column=column)) + return ret + + @property + def entries_not_in_annotation_list(self) -> List[NotInAnnotationList]: + """Return a list of entries not in a given annotations.""" + ret = [] + + anno_kwd_used_and_as_list = self.used_annotation_keywords & self._annotations.keywords_by_type(LIST) + + for kwd in anno_kwd_used_and_as_list: + elcs = self._annotation_entries.get_entry_line_column_list_by_keyword(kwd) + for entry, line, column in elcs: + if entry not in self._annotations.keyword_dict[kwd].value: + ret.append(NotInAnnotationList(keyword=kwd, entry=entry, line_number=line, column=column)) + return ret + + @property + def entries_not_in_namespace_list(self) -> List[NotInNamespaceList]: + """Return a list of entries not in a given namespace.""" + ret = [] + + ns_kwd_used_and_as_list = self.used_namespace_keywords & self._namespaces.keywords_by_type(LIST) + + for kwd in ns_kwd_used_and_as_list: + elcs = self._namespace_entries.get_entry_line_column_list_by_keyword(kwd) + for entry, line, column in elcs: + if entry not in self._namespaces.keyword_dict[kwd].value: + ret.append(NotInNamespaceList(keyword=kwd, entry=entry, line_number=line, column=column)) + return ret + + @property + def entries_without_namespace(self) -> List[WithoutDefinedNamespace]: + """Return WithoutDefinedNamespace list.""" + ret = [] + for namespace_keyword in self.namespaces_without_definition: + elcs = self._namespace_entries.get_entry_line_column_list_by_keyword(namespace_keyword) + for entry, line, column in elcs: + ret.append( + WithoutDefinedNamespace( + keyword=namespace_keyword, + entry=entry, + line_number=line, + column=column, + ) + ) + return ret + + @property + def entries_without_annotation(self) -> List[WithoutDefinedAnnotation]: + """Return WithoutDefinedNamespace 
list.""" + ret = [] + for annotation_keyword in self.annotations_without_definition: + elcs = self._annotation_entries.get_entry_line_column_list_by_keyword(annotation_keyword) + for entry, line, column in elcs: + ret.append( + WithoutDefinedAnnotation( + keyword=annotation_keyword, + entry=entry, + line_number=line, + column=column, + ) + ) + return ret + + def update_database(self) -> None: + """Update namespace and annotation entries in database if not exists by url and keyword.""" + if not (self.__namespace_in_db_updated and self.__annotations_in_db_updated): + self.__namespace_in_db_updated = self.update_namespaces_in_db() + self.__annotations_in_db_updated = self.update_annotations_in_db() + + def set_entry_not_in_namespace_list_errors(self): + pass + + def set_entry_not_in_annotation_list_errors(self): + pass + + @property + def entries_not_in_namespace_url(self) -> List[NotInNamespaceUrl]: + """Return a list of entries not exists in namespaces referenced as URL. + + Returns + ------- + List[NotInNamespaceUrl] + Description of returned object. 
+ """ + entries_not_in_namespace = [] + + for keyword in self.used_namespace_keywords: + namespace = self._namespaces.keyword_dict[keyword] + + if namespace.as_type == URL: + url = namespace.value + elc_list = self._namespace_entries.get_entry_line_column_list_by_keyword(keyword) + + names_not_exists = self.namespace_manager.get_entries_not_exists( + keyword=keyword, + url=url, + entry_line_column_list=elc_list, + ) + + for entry, line, column, hint in names_not_exists: + error = NotInNamespaceUrl( + keyword=keyword, + url_or_path=url, + entry=entry, + line_number=line, + column=column, + hint=hint, + ) + entries_not_in_namespace.append(error) + + return entries_not_in_namespace + + @property + def entries_not_in_annotation_url(self) -> List[_Error]: + """Return a list of entries not in the annotation URL.""" + entries_not_in_annotation = [] + + for keyword in self.used_annotation_keywords: + annotation = self._annotations.keyword_dict[keyword] + + if annotation.as_type == URL: + url = annotation.value + elc_list = self._annotation_entries.get_entry_line_column_list_by_keyword(keyword) + + names_not_exists = self.annotation_manager.get_entries_not_exists( + keyword=keyword, url=url, entry_line_column_list=elc_list + ) + + for entry, line, column, hint in names_not_exists: + error = NotInAnnotationUrl( + keyword=keyword, + url_or_path=url, + entry=entry, + line_number=line, + column=column, + hint=hint, + ) + entries_not_in_annotation.append(error) + return entries_not_in_annotation + + def update_annotations_in_db(self) -> bool: + """Update annotation in database if URL and keyword not exists.""" + import_success = True + for anno in self._annotations.to_update: + if anno.keyword in self.used_annotation_keywords: + if not self.annotation_manager.keyword_url_exists(keyword=anno.keyword, url=anno.value): + if anno.as_type == URL: + logger.info(f"Update db with annotation {anno.keyword}: download from {anno.value}") + + ( + successful, + error, + ) = 
self.annotation_manager.save_from_url_or_path( + keyword=anno.keyword, + url_or_path=anno.value, + doc_type=anno.as_type, + ) + + if not successful: + import_success = False + error_args = error.args[0].split("\n") + string_error = error_args[2] if len(error_args) > 1 else error_args[0] + logger.error( + f"Annotation {anno.keyword} failed to be added from {anno.value}", + exc_info=False, + ) + + if "column" in dir(error): # Indicates it's a Lark error + download_error = NotDownloadedFromUrl( + keyword=anno.keyword, + url_or_path=anno.value, + column=error.column, + line=error.line, + hint=f'{error.allowed} error in "{string_error}"', + ) + + else: # It's an HTTPError of some kind + download_error = NotDownloadedFromUrl( + keyword=anno.keyword, + url_or_path=anno.value, + column=0, + line=0, + hint=f"{string_error}", + ) + self.notDownloadedFromUrls.append(download_error) + + return import_success + + def update_namespaces_in_db(self) -> bool: + """Update namespaces in database if URL and keyword does not exist.""" + import_success = True + for ns in self._namespaces.to_update: + if ns.keyword in self.used_namespace_keywords: + if not self.namespace_manager.keyword_url_exists(keyword=ns.keyword, url=ns.value): + if ns.as_type == URL: + logger.info(f"Update db with namespace {ns.keyword}: download from {ns.value}") + + ( + successful, + error, + ) = self.namespace_manager.save_from_url_or_path( + keyword=ns.keyword, + url_or_path=ns.value, + doc_type=ns.as_type, + ) + + if not successful: + import_success = False + error_args = error.args[0].split("\n") + string_error = error_args[2] if len(error_args) > 1 else error_args[0] + logger.error( + f"Namespace {ns.keyword} failed to be added from {ns.value}", + exc_info=False, + ) + + if "column" in dir(error): # Indicates it's a Lark error + download_error = NotDownloadedFromUrl( + keyword=ns.keyword, + url_or_path=ns.value, + column=error.column, + line=error.line, + hint=f'{error.allowed} error in "{string_error}"', + ) 
+ + else: # It's an HTTPError of some kind + download_error = NotDownloadedFromUrl( + keyword=ns.keyword, + url_or_path=ns.value, + column=0, + line=0, + hint=f"{string_error}", + ) + + self.notDownloadedFromUrls.append(download_error) + + return import_success + + @property + def namespaces_with_multiple_definitions(self): + """Return all Namespace objects with several definitions. + + This is returned as a dictionary (key:keyword, value: list of Namespace objects). + """ + ret = defaultdict(list) + multiple_keyword = [k for k, v in Counter(self._namespaces.keywords).items() if v > 1] + for ns in self._namespaces: + if ns.keyword in multiple_keyword: + ret[ns.keyword].append(ns) + return dict(ret) + + @property + def annotations_with_multiple_definitions(self): + """Return all Annotation objects with several definitions. + + This is returned as a dictionary (key:keyword, value: list of Annotation objects). + """ + ret = defaultdict(list) + multiple_keyword = [k for k, v in Counter(self._annotations.keywords).items() if v > 1] + for anno in self._annotations: + if anno.keyword in multiple_keyword: + ret[anno.keyword].append(anno) + return dict(ret) + + @property + def namespaces_without_definition(self): + """Return set of namespace keywords used in statements but not defined with a reference. + + :return set: set of str + """ + return set(self._namespace_entries.keywords) - set(self._namespaces.keywords) + + @property + def annotations_without_definition(self): + """Return a set of annotation keywords not defined with a reference. 
+ + :return set: set of str + """ + return set(self._annotation_entries.keywords) - set(self._annotations.keywords) + + @property + def used_namespace_keywords(self) -> Set[str]: + """Return set of used namespace keywords (with reference and used in statements).""" + return set(self._namespace_entries.keywords) & set(self._namespaces.keywords) + + @property + def used_annotation_keywords(self) -> Set[str]: + """Return set of used namespace keywords.""" + return set(self._annotation_entries.keywords) & set(self._annotations.keywords) + + @property + def namespace_keywords_in_statements(self): + """Return all unique namespace keywords used in statements.""" + return self._namespace_entries.keywords + + @property + def annotation_keywords_in_statements(self): + """Return all unique annotation keywords used in statements.""" + return self._namespace_entries.keywords + + def get_entries_by_namespace_keyword(self, keyword): + """Get all entries by namespace keyword. + + :param keyword: namespace keyword + :return set: all entries in the namespace + """ + return self._namespace_entries.get_entries_by_keyword(keyword) + + def get_entries_by_annotation_keyword(self, keyword): + """Get all entries by namespace keyword. + + :param keyword: namespace keyword + :return set: all entries in the namespace + """ + return self._annotation_entries.get_entries_by_keyword(keyword) + + +class Entries: + """Abstract class representing namespaces and annotations.""" + + tokens = defaultdict(dict) + entries = defaultdict(set) + + def get_entry_line_column_list_by_keyword(self, keyword: str) -> Generator[str, int, int]: + """Get generator of tuple(entry, line, column) by keyword. + + Parameters + ---------- + keyword: str + Description of parameter `keyword: str`. + + Returns + ------- + Generator + Generator of tuple(entry: str, line: int, column: int). 
+ """ + for entry, tokens in self.tokens[keyword].items(): + for token in tokens: + yield entry, token.line, token.column + + @property + def keywords(self): + """Return a list of unique keywords used in SETs.""" + return self.entries.keys() + + def get_entries_by_keyword(self, keyword: str) -> Set: + """Get entries by keyword. + + :param str keyword: keyword to retrieve from dict + """ + return self.entries.get(keyword, set()) + + def get_tokens_by_keyword(self, keyword: str) -> Dict: + """Get tokens by keyword. + + :param str keyword: keyword to retrieve from dict + """ + return self.entries.get(keyword, set()) + + def __str__(self): + """String representation of object.""" + return str(dict(self.tokens)) + + +class NamespaceEntries(Entries): + """Namespace subclass of Entries.""" + + def __init__(self): + """Init.""" + self.entries = defaultdict(set) + self.tokens = defaultdict(dict) + + def set_namespace_entry(self, keyword, entry, token): + """Set namespace, entry and lark.lexer.Token. + + :param str keyword: namespace + :param str entry: entry + :param lark.lexer.Token token: Token object from lark library + """ + if isinstance(token, Token): + self.entries[keyword] |= {entry} + if keyword in self.tokens and entry in self.tokens[keyword]: + self.tokens[keyword][entry].append(token) + else: + self.tokens[keyword][entry] = [token] + else: + raise "Argument token is type {} not {}".format(type(token), "lark.lexer.Token") + + +class AnnotationEntries(Entries): + """Annotation subclass of Entries.""" + + def __init__(self): + """Init.""" + self.entries = defaultdict(set) + self.tokens = defaultdict(dict) + + def set_annotation_entry(self, keyword: str, entry, token): + """Set annotation, entry and lark.lexer.Token. 
+ + :param str keyword: annotation + :param entry: entry + :param lark.lexer.Token token: Token object from lark library + """ + if isinstance(token, Token): + self.entries[keyword] |= {entry} + if keyword in self.tokens and entry in self.tokens[keyword]: + self.tokens[keyword][entry].append(token) + else: + self.tokens[keyword][entry] = [token] + else: + raise "argument token is type {} not {}".format(type(token), "lark.lexer.Token") + + +class NsAnsBase: + """Parent class for class Namespace and Annotation.""" + + def __init__(self, obj_class): + """Init.""" + self.__objs = [] + self.class_ = obj_class + + def add(self, as_type: str, keyword: str, value: str): + """Add obj to list of objs. + + :param str as_type: allowed keywords 'file', 'url' or 'list' + :param str keyword: keyword used in object + :param str value: value of object + :return: + """ + obj = self.class_(as_type, keyword, value) + self.__objs.append(obj) + + @property + def type_dict(self) -> DefaultDict: + """Convert to list of dictionaries.""" + ret = defaultdict(list) + [ret[obj.as_type].append(obj) for obj in self] + return ret + + def by_type(self, as_type: str): + """Return list of Namespace objects by 'list', 'url' or 'file'.""" + if as_type not in ALLOWED_TYPES: + raise "{} not in allowed types {}".format(as_type, ALLOWED_TYPES) + return [obj for obj in self if obj.as_type == as_type] + + def keywords_by_type(self, as_type: str) -> Set[str]: + """Return a set of keywords by Namespace type 'list', 'url' or 'file'.""" + if as_type not in ALLOWED_TYPES: + raise "{} not in allowed types {}".format(as_type, ALLOWED_TYPES) + return set([obj.keyword for obj in self if obj.as_type == as_type]) + + @property + def keyword_dict(self) -> Dict: + """Return a dictionary of key=keyword, value: Namespace or Annotation object.""" + ret = dict() + for obj in self: + ret[obj.keyword] = obj + return ret + + @property + def keywords(self) -> List[str]: + """Return all keywords.""" + return [obj.keyword for obj 
in self.__objs] + + @property + def to_update(self) -> List: + """Return a list of all Namespace or Annotation (NS_or_Anno) objects with URL or file path. + + :return list: list of all Namespace or Annotation (NS_or_Anno) objects with URL or file path + """ + return self.type_dict[URL] + + def __iter__(self): + """Return a generator of objects (Namespace or Annotation).""" + for obj in self.__objs: + yield obj + + +class Namespaces(NsAnsBase): + """Namespace child class.""" + + def __init__(self): + """init.""" + super(Namespaces, self).__init__(obj_class=Namespace) + + +class Annotations(NsAnsBase): + """Annotation child class.""" + + def __init__(self): + """init.""" + super(Annotations, self).__init__(obj_class=Annotation) + + +class Namespace: + """Namespace class to represent BEL statement namespaces.""" + + def __init__(self, as_type, keyword, value): + """Namespace init.""" + self.as_type = as_type + self.keyword = keyword + self.value = value + + def to_dict(self): + """Convert class values to dictionary.""" + return {"as_type": self.as_type, "keyword": self.keyword, "value": self.value} + + def __unicode__(self): + return "Namespace:" + str(self.to_dict()) + + def __str__(self): + return self.__unicode__() + + +class Annotation: + """Annotation class to represent BEL statement annotations.""" + + def __init__(self, as_type, keyword, value): + """Annotation init.""" + self.as_type = as_type + self.keyword = keyword + self.value = value + + def to_dict(self): + """Convert class values to dictionary.""" + return {"as_type": self.as_type, "keyword": self.keyword, "value": self.value} + + def __unicode__(self): + return "Annotation" + str(self.to_dict()) + + def __str__(self): + return self.__unicode__() diff --git a/ebel/validate.py b/ebel/validate.py index 6bde4c4..68630c4 100644 --- a/ebel/validate.py +++ b/ebel/validate.py @@ -1,34 +1,34 @@ """Collect of methods used for validating a BEL file.""" -import os -import re import csv import difflib import 
logging - -from typing import Iterable, Union, Optional -from textwrap import fill +import os +import re from pathlib import Path +from textwrap import fill +from typing import Iterable, Optional, Union import numpy as np import pandas as pd import ebel.database from ebel.errors import BelSyntaxError -from ebel.parser import check_bel_script_line_by_line, check_bel_script, bel_to_json - +from ebel.parser import bel_to_json, check_bel_script, check_bel_script_line_by_line logger = logging.getLogger(__name__) -def validate_bel_file(bel_script_path: Union[str, Path], - force_new_db: bool = False, - line_by_line: bool = False, - reports: Union[Iterable[str], str] = None, - bel_version: str = '2_1', - tree: bool = False, - sqlalchemy_connection_str: str = None, - json_file: bool = True, - force_json: bool = False,): +def validate_bel_file( + bel_script_path: Union[str, Path], + force_new_db: bool = False, + line_by_line: bool = False, + reports: Union[Iterable[str], str] = None, + bel_version: str = "2_1", + tree: bool = False, + sqlalchemy_connection_str: str = None, + json_file: bool = True, + force_json: bool = False, +): """Validate BEL script for correct syntax following eBNF grammar. 
Parameters @@ -74,9 +74,9 @@ def validate_bel_file(bel_script_path: Union[str, Path], if line_by_line: # TODO: This is perhaps not working - result = check_bel_script_line_by_line(bel_script_path, - error_report_file_path=reports, - bel_version=bel_version) + result = check_bel_script_line_by_line( + bel_script_path, error_report_file_path=reports, bel_version=bel_version + ) if reports: logger.info("Wrote report to %s\n" % reports) @@ -88,7 +88,7 @@ def validate_bel_file(bel_script_path: Union[str, Path], ebel.database.set_connection(sqlalchemy_connection_str) bel_files = _create_list_bel_files(bel_path=bel_script_path) - validation_results['bel_files_checked'] = bel_files + validation_results["bel_files_checked"] = bel_files for bel_file in bel_files: # Create dict to be filled for individual BEL files. @@ -104,45 +104,48 @@ def validate_bel_file(bel_script_path: Union[str, Path], if json_file: if result["errors"]: if force_json: # Check for syntax errors - bel_syntax_error_present = result['errors'] and any( + bel_syntax_error_present = result["errors"] and any( [isinstance(error_type, BelSyntaxError) for error_type in result["errors"]] ) if bel_syntax_error_present: logger.error("Cannot force JSON file due to syntax errors. 
Please check the BEL file.") else: - json_file = _write_odb_json(bel_path=bel_file, results=result, bel_version=bel_version) - validation_results[bel_file]['json'] = json_file + json_file = _write_odb_json( + bel_path=bel_file, + results=result, + bel_version=bel_version, + ) + validation_results[bel_file]["json"] = json_file else: logger.error("Unable to create JSON file due to grammar/syntax errors in BEL file") else: # No errors so everything is fine json_file = _write_odb_json(bel_path=bel_file, results=result, bel_version=bel_version) - validation_results[bel_file]['json'] = json_file + validation_results[bel_file]["json"] = json_file if tree: - if result['errors']: + if result["errors"]: logger.error("Tree can not be printed because errors still exists\n") else: - logger.debug(result['tree']) - validation_results[bel_file]['tree'] = result['tree'] + logger.debug(result["tree"]) + validation_results[bel_file]["tree"] = result["tree"] - if result['warnings'] and reports: - report_paths = _write_report(reports, result, report_type='warnings') - validation_results[bel_file]['reports'] = report_paths - - elif result['errors']: + if result["warnings"] and reports: + report_paths = _write_report(reports, result, report_type="warnings") + validation_results[bel_file]["reports"] = report_paths + elif result["errors"]: if not reports: - logger.info('\n'.join([x.to_string() for x in result['errors']]) + "\n") + logger.info("\n".join([x.to_string() for x in result["errors"]]) + "\n") else: - _write_report(reports, result, report_type='errors') + _write_report(reports, result, report_type="errors") -def repair_bel_file(bel_script_path: str, new_file_path: Optional[str] = None): +def repair_bel_file(bel_script_path: str, new_file_path: str, diff: bool = False): """Repair a BEL document. Parameters @@ -151,17 +154,20 @@ def repair_bel_file(bel_script_path: str, new_file_path: Optional[str] = None): Path to the BEL file. 
new_file_path : str (optional) Export repaired version of file to new path. + diff : bool (optional) + Also export a file showing the differences between the original and the repaired files. """ # if evidence: # regular expression for missing continuous line (\ at the end of line) with open(bel_script_path, "r", encoding="utf-8") as belfile: content = belfile.read() - new_content = content + new_content = replace_ebel_relation_terms(content) - for regex_pattern in re.findall(r'\n((SET\s+(DOCUMENT\s+Description|Evidence|SupportingText|Support)' - r'\s*=\s*)"(((?<=\\)"|[^"])+)"\s*\n*)', - content): + for regex_pattern in re.findall( + r"\n((SET\s+(DOCUMENT\s+Description|Evidence|SupportingText|Support)" r'\s*=\s*)"(((?<=\\)"|[^"])+)"\s*\n*)', + content, + ): if regex_pattern[2].startswith("DOCUMENT"): new_prefix = "SET DOCUMENT Description = " else: @@ -169,27 +175,61 @@ def repair_bel_file(bel_script_path: str, new_file_path: Optional[str] = None): new_evidence_text = re.sub(r"(\\?[\r\n]+)|\\ ", " ", regex_pattern[3].strip()) new_evidence_text = re.sub(r"\s{2,}", " ", new_evidence_text) - new_evidence_text = re.sub(r'(\\)(\w)', r'\g<2>', new_evidence_text) + new_evidence_text = re.sub(r"(\\)(\w)", r"\g<2>", new_evidence_text) new_evidence_text = fill(new_evidence_text, break_long_words=False).replace("\n", " \\\n") new_evidence = new_prefix + '"' + new_evidence_text + '"\n\n' new_content = new_content.replace(regex_pattern[0], new_evidence) if content != new_content: - if new_file_path: - with open(new_file_path + ".diff2repaired", "w") as new_file: - new_file.write('\n'.join(list(difflib.ndiff(content.split("\n"), new_content.split("\n"))))) - - else: - with open(bel_script_path, "w") as output_file: - output_file.write(new_content) - - -def _write_odb_json(bel_path: str, results: dict, bel_version: str) -> str: - json_path = bel_path + ".json" + if diff: + with open(bel_script_path + ".diff2repaired", "w", encoding="utf-8") as new_file: + 
new_file.write("\n".join(list(difflib.ndiff(content.split("\n"), new_content.split("\n"))))) + + with open(new_file_path, "w", encoding="utf-8") as output_file: + output_file.write(new_content) + + +def replace_ebel_relation_terms(bel_file_content: str): + """Replace the eBEL substituted relation terms for the proper BEL relation names e.g. directly_increases + to directlyIncreases.""" + repaired_content = bel_file_content + ebel_to_bel_map = { + "analogous_to": "analogousTo", + "biomarker_for": "biomarkerFor", + "causes_no_change": "causesNoChange", + "directly_decreases": "directlyDecreases", + "directly_increases": "directlyIncreases", + "equivalent_to": "eq", + "has_component": "hasComponent", + "has_components": "hasComponents", + "has_member": "hasMember", + "has_members": "hasMembers", + "is_a": "isA", + "negative_correlation": "neg", + "positive_correlation": "pos", + "prognostic_biomarker_for": "prognosticBiomarkerFor", + "rate_limiting_step_of": "rateLimitingStepOf", + "sub_process_of": "subProcessOf", + "transcribed_to": "transcribedTo", + "translated_to": "translatedTo", + } + + # Can use simple string replace since eBEL terms are quite unique + for ebel_term, bel_term in ebel_to_bel_map.items(): + repaired_content = repaired_content.replace(ebel_term, bel_term) + + return repaired_content + + +def _write_odb_json(bel_path: Union[str, Path], results: dict, bel_version: str) -> str: + if isinstance(bel_path, str): + bel_path = Path(bel_path) + json_path = bel_path.with_suffix(".bel.json") if int(bel_version[0]) > 1: - json_tree = bel_to_json(results['tree']) - open(json_path, "w").write(json_tree) + json_tree = bel_to_json(results["tree"]) + with open(json_path, "w") as jf: + jf.write(json_tree) return json_path @@ -224,13 +264,21 @@ def _write_report(reports: Union[Iterable[str], str], result: dict, report_type: Returns ------- list - List of file paths for the reports written. + File paths for the reports written. 
""" # TODO: report_type options should be constants errors_or_warns_as_list_of_dicts = [x.to_dict() for x in result[report_type]] - columns = [report_type[:-1] + "_class", "url", "keyword", "entry", "line_number", "column", "hint"] + columns = [ + report_type[:-1] + "_class", + "url", + "keyword", + "entry", + "line_number", + "column", + "hint", + ] df = pd.DataFrame(data=errors_or_warns_as_list_of_dicts, columns=columns) df.index += 1 @@ -239,10 +287,10 @@ def _write_report(reports: Union[Iterable[str], str], result: dict, report_type: for report in reports: try: - if report.endswith('.csv'): + if report.endswith(".csv"): df.to_csv(report, index=False) - if report.endswith('.xls'): + if report.endswith(".xls"): try: df.to_excel(report, index=False) @@ -250,41 +298,55 @@ def _write_report(reports: Union[Iterable[str], str], result: dict, report_type: logger.warning("Max Excel sheet size exceeded. Writing to CSV instead.") df.to_csv(report, index=False) - if report.endswith('.xlsx'): + if report.endswith(".xlsx"): try: - df.to_excel(report, engine='xlsxwriter', index=False) + df.to_excel(report, engine="xlsxwriter", index=False) except ValueError: logger.warning("Max Excel sheet size exceeded. 
Writing to CSV instead.") df.to_csv(report, index=False) - if report.endswith('.tsv'): - df.to_csv(report, sep='\t', index=False) + if report.endswith(".tsv"): + df.to_csv(report, sep="\t", index=False) - if report.endswith('.json'): + if report.endswith(".json"): df.to_json(report, index=False) - if report.endswith('.txt'): + if report.endswith(".txt"): open(report, "w").write(df.to_string(index=False)) - if report.endswith('.html'): + if report.endswith(".html"): df.to_html(report, index=False) - if report.endswith('.md'): + if report.endswith(".md"): cols = df.columns - df2 = pd.DataFrame([['---', ] * len(cols)], columns=cols) + df2 = pd.DataFrame( + [ + [ + "---", + ] + * len(cols) + ], + columns=cols, + ) if df.hint.dtype == np.str: - df.hint = df.hint.str.replace(r'\|', '|') + df.hint = df.hint.str.replace(r"\|", "|") if df.entry.dtype == np.str: - df.entry = df.entry.str.replace(r'\|', '|') + df.entry = df.entry.str.replace(r"\|", "|") - df.url = [("[url](" + str(x) + ")" if not pd.isna(x) else '') for x in df.url] + df.url = [("[url](" + str(x) + ")" if not pd.isna(x) else "") for x in df.url] url_template = "[%s](" + report.split(".bel.")[0] + ".bel?expanded=true&viewer=simple#L%s)" df.line_number = [url_template % (x, x) for x in df.line_number] df3 = pd.concat([df2, df]) - df3.to_csv(report, sep="|", index=False, quoting=csv.QUOTE_NONE, escapechar="\\") + df3.to_csv( + report, + sep="|", + index=False, + quoting=csv.QUOTE_NONE, + escapechar="\\", + ) except PermissionError: logger.error("Previous version of error report is still open and cannot be overwritten. 
Unable to update.") diff --git a/ebel/warnings.py b/ebel/warning_definitions.py old mode 100755 new mode 100644 similarity index 72% rename from ebel/warnings.py rename to ebel/warning_definitions.py index afb17eb..634f4b7 --- a/ebel/warnings.py +++ b/ebel/warning_definitions.py @@ -1,8 +1,9 @@ """Warning class definitions.""" from collections import OrderedDict -TEMPLATE = '{error_class}\tkeyword:{keyword}\tentry:{entry}\tline:{line_number}\tcolumn:{column}' \ - '\turl:{url}\thint:{hint}' +TEMPLATE = ( + "{error_class}\tkeyword:{keyword}\tentry:{entry}\tline:{line_number}\tcolumn:{column}" "\turl:{url}\thint:{hint}" +) class _Warning: @@ -10,19 +11,21 @@ class _Warning: def __init__(self): self.class_name = self.__class__.__name__ - self.value_dict = OrderedDict([ - ("warning_class", self.class_name), - ("url", None), - ("keyword", None), - ("entry", None), - ("line_number", None), - ("column", None), - ("hint", None) - ]) + self.value_dict = OrderedDict( + [ + ("warning_class", self.class_name), + ("url", None), + ("keyword", None), + ("entry", None), + ("line_number", None), + ("column", None), + ("hint", None), + ] + ) def to_dict(self): """Format the properties of error into a dictionary.""" - raise NotImplementedError('to_dict have to be implemented in {}'.format(self.__class__.__name__)) + raise NotImplementedError("to_dict have to be implemented in {}".format(self.__class__.__name__)) def to_string(self) -> str: self.value_dict.update(self.to_dict()) @@ -60,5 +63,5 @@ def to_dict(self) -> dict: "entry": self.entry, "line_number": self.line_number, "column": self.column, - "hint": self.hint + "hint": self.hint, } diff --git a/ebel/web/api/__init__.py b/ebel/web/api/__init__.py index a31ebe5..56f3678 100644 --- a/ebel/web/api/__init__.py +++ b/ebel/web/api/__init__.py @@ -1,9 +1,10 @@ """Base methods for API.""" -from sqlalchemy.orm import Session -from sqlalchemy import create_engine from typing import Dict, List, Union + from pymysql.converters import 
escape_string +from sqlalchemy import create_engine +from sqlalchemy.orm import Session from sqlalchemy_utils import create_database, database_exists from ebel import Bel @@ -38,7 +39,10 @@ def get_session(): class OdbRequest: """OrientDB class definition for interfacing with the ODB server.""" - def __init__(self, request_query_dict: Dict[str, Dict[str, Dict[str, Union[str, int, bool, float, None]]]]): + def __init__( + self, + request_query_dict: Dict[str, Dict[str, Dict[str, Union[str, int, bool, float, None]]]], + ): """Init method.""" self.__request_query_dict = request_query_dict if self.validate(): @@ -47,7 +51,9 @@ def __init__(self, request_query_dict: Dict[str, Dict[str, Dict[str, Union[str, raise TypeError("RequestQuery must be initialized with Dict[str, Dict[str, Dict[str, str]]]") @property - def odb_classes(self) -> Dict[str, Dict[str, Dict[str, Union[str, int, bool, float, None]]]]: + def odb_classes( + self, + ) -> Dict[str, Dict[str, Dict[str, Union[str, int, bool, float, None]]]]: """Return ODB classes: structure: {'odb_class', {'odb_column': {'option':str, 'value': str}, ...}}}.""" return self.__odb_classes @@ -55,13 +61,18 @@ def validate(self): """Check if request_query_dict is correct.""" validated: bool = False if isinstance(self.__request_query_dict, Dict): - for odb_class, column_params in self.__request_query_dict.items(): # k=ODB class, v=columns + for ( + odb_class, + column_params, + ) in self.__request_query_dict.items(): # k=ODB class, v=columns if isinstance(odb_class, str) and isinstance(column_params, Dict): for column_name, value_option_dict in column_params.items(): if isinstance(value_option_dict, Dict): - keywords_exists = {'value', 'option'}.issubset(set(value_option_dict.keys())) - value_types_ok = isinstance(value_option_dict['value'], - (str, int, bool, float, type(None))) + keywords_exists = {"value", "option"}.issubset(set(value_option_dict.keys())) + value_types_ok = isinstance( + value_option_dict["value"], + (str, int, 
bool, float, type(None)), + ) if keywords_exists and value_types_ok: validated = True return validated @@ -96,7 +107,7 @@ def get_sql(self): sql_list.append(sql_class.format(odb_class=odb_class)) for column in columns: sql_list.append(f"{odb_class}.{column} as {odb_class}__{column.replace('.', '_')}") - return ', '.join(sql_list) + return ", ".join(sql_list) class ValueOption: @@ -128,32 +139,32 @@ def get_operator_value_sql_string(self, dialect: Union[str, None] = None) -> Uni """ if self.value: if isinstance(self.value, str): - search_value = escape_string(self.value.strip(), 'utf-8') + search_value = escape_string(self.value.strip(), "utf-8") else: search_value = self.value - if self.option in ['=', 'exact']: - operator = '=' + if self.option in ["=", "exact"]: + operator = "=" - elif self.option == 'regular expression': - operator = 'RLIKE' + elif self.option == "regular expression": + operator = "RLIKE" else: - operator = 'LIKE' + operator = "LIKE" option_choices = { "exact": f"'{self.value}'", "contains": f"'%{self.value}%'", "starts with": f"'{self.value}%'", "ends with": f"'{self.value}%'", - "regular expression": f"'{self.value}'" + "regular expression": f"'{self.value}'", } if self.option in option_choices: search_value = option_choices[self.option] - elif self.option in ['>', '>=', '<=', '<']: + elif self.option in [">", ">=", "<=", "<"]: operator = self.option elif operator == "LIKE": @@ -162,9 +173,7 @@ def get_operator_value_sql_string(self, dialect: Union[str, None] = None) -> Uni return f"{operator} {search_value}" -def get_sql_match(odb_request: OdbRequest, - columns_dict: Dict[str, List[str]], - match_template: str) -> str: +def get_sql_match(odb_request: OdbRequest, columns_dict: Dict[str, List[str]], match_template: str) -> str: """Return an OrientDB match string. 
Example: @@ -198,11 +207,11 @@ def get_sql_match(odb_request: OdbRequest, if odb_request.odb_classes.get(odb_class): where_array = [] for column, values in odb_request.odb_classes[odb_class].items(): - if values['value']: + if values["value"]: operator_value_string = ValueOption(**values).get_operator_value_sql_string() if operator_value_string: where_array.append(f"`{column}` {operator_value_string}") - where_dict[odb_class] = ", where:(" + " and ".join(where_array) + ")" if where_array else '' + where_dict[odb_class] = ", where:(" + " and ".join(where_array) + ")" if where_array else "" sql_match = match_template.format(**where_dict) return sql_match @@ -241,6 +250,6 @@ def get_odb_options(column: str, query_request: OdbRequest, class_column_dict, s ({sql_match} return {column} as option) where option IS NOT NULL and option!='' order by option""" - results = [str(x.oRecordData.get('option')) for x in client.command(query_string)] + results = [str(x.oRecordData.get("option")) for x in client.command(query_string)] return results diff --git a/ebel/web/api/constants.py b/ebel/web/api/constants.py index fa16f31..17dc920 100644 --- a/ebel/web/api/constants.py +++ b/ebel/web/api/constants.py @@ -1,58 +1,58 @@ """API server string constants.""" select_bel_columns_list = [ - 'out.name as subject_name', - 'out.label as subject_label', - 'out.namespace as subject_namespace', - 'out.bel as subject_bel', - 'out.involved_genes as subject_involved_genes', - 'out.involved_other as subject_involved_other', - 'out.@class as subject_class', - 'out.@rid.asString() as subject_id', - '@class as relation', - 'pmid', - '@rid.asString() as edge_id', - 'evidence', - 'citation.last_author as last_author', - 'citation.title as title', - 'citation.pub_date as publication_date', - 'annotation', - 'in.name as object_name', - 'in.label as object_label', - 'in.namespace as object_namespace', - 'in.bel as object_bel', - 'in.involved_genes as object_involved_genes', - 'in.involved_other as 
object_involved_other', - 'in.@class as object_class', - 'in.@rid.asString() as object_id' + "out.name as subject_name", + "out.label as subject_label", + "out.namespace as subject_namespace", + "out.bel as subject_bel", + "out.involved_genes as subject_involved_genes", + "out.involved_other as subject_involved_other", + "out.@class as subject_class", + "out.@rid.asString() as subject_id", + "@class as relation", + "pmid", + "@rid.asString() as edge_id", + "evidence", + "citation.last_author as last_author", + "citation.title as title", + "citation.pub_date as publication_date", + "annotation", + "in.name as object_name", + "in.label as object_label", + "in.namespace as object_namespace", + "in.bel as object_bel", + "in.involved_genes as object_involved_genes", + "in.involved_other as object_involved_other", + "in.@class as object_class", + "in.@rid.asString() as object_id", ] -select_bel_columns = ', '.join(select_bel_columns_list) +select_bel_columns = ", ".join(select_bel_columns_list) match_bel_columns_list = [ - 's.bel as subject_bel', - 's.involved_genes as subject_involved_genes', - 's.involved_other as subject_involved_other', - 's.@class as subject_class', - 's.@rid.asString() as subject_id', - 's.name as subject_name', - 's.label as subject_label', - 's.namespace as subject_namespace', - 'r.@class as relation', - 'r.pmid as pmid', - 'r.@rid.asString() as edge_id', - 'r.evidence as evidence', - 'r.citation.last_author as last_author', - 'r.citation.title as title', - 'r.citation.pub_date as publication_date', - 'r.annotation as annotation', - 'o.bel as object_bel', - 'o.involved_genes as object_involved_genes', - 'o.involved_other as object_involved_other', - 'o.@class as object_class', - 'o.@rid.asString() as object_id', - 'o.name as object_name', - 'o.label as object_label', - 'o.namespace as object_namespace', + "s.bel as subject_bel", + "s.involved_genes as subject_involved_genes", + "s.involved_other as subject_involved_other", + "s.@class as 
subject_class", + "s.@rid.asString() as subject_id", + "s.name as subject_name", + "s.label as subject_label", + "s.namespace as subject_namespace", + "r.@class as relation", + "r.pmid as pmid", + "r.@rid.asString() as edge_id", + "r.evidence as evidence", + "r.citation.last_author as last_author", + "r.citation.title as title", + "r.citation.pub_date as publication_date", + "r.annotation as annotation", + "o.bel as object_bel", + "o.involved_genes as object_involved_genes", + "o.involved_other as object_involved_other", + "o.@class as object_class", + "o.@rid.asString() as object_id", + "o.name as object_name", + "o.label as object_label", + "o.namespace as object_namespace", ] -match_bel_columns = ', '.join(match_bel_columns_list) +match_bel_columns = ", ".join(match_bel_columns_list) diff --git a/ebel/web/api/ebel/v1/__init__.py b/ebel/web/api/ebel/v1/__init__.py index d6372e5..d1088fd 100644 --- a/ebel/web/api/ebel/v1/__init__.py +++ b/ebel/web/api/ebel/v1/__init__.py @@ -5,8 +5,7 @@ from collections import namedtuple from enum import Enum from math import ceil -from typing import Dict, Type -from typing import List +from typing import Dict, List, Type from flask import request from sqlalchemy import inspect, not_ @@ -17,7 +16,7 @@ from ebel import Bel from ebel.web.api import RDBMS -Pagination = namedtuple('Pagination', ['page', 'page_size', 'skip']) +Pagination = namedtuple("Pagination", ["page", "page_size", "skip"]) class OrientDbSqlOperator(Enum): @@ -45,28 +44,31 @@ class DataType(Enum): class SubRelObj(Enum): """String constant definitions.""" - SUBJECT = 's' - RELATION = 'r' - OBJECT = 'o' + SUBJECT = "s" + RELATION = "r" + OBJECT = "o" def _get_pagination() -> Pagination: """Get page and page_size from request.""" request_obj = request.args if request.args else json.loads(request.data) - page_size = request_obj.get('page_size', 10) + page_size = request_obj.get("page_size", 10) page_size = int(page_size) if (isinstance(page_size, int) or 
re.search(r"^\d+$", page_size)) else 10 - page = request_obj.get('page', 1) + page = request_obj.get("page", 1) page = int(page) if (isinstance(page, int) or re.search(r"^\d+$", page)) else 1 skip = (page - 1) * page_size return Pagination(page=page, page_size=page_size, skip=skip) -def _get_data(model: Type[DeclarativeMeta], print_sql=False, order_by: List[InstrumentedAttribute] = []): +def _get_data( + model: Type[DeclarativeMeta], + print_sql=False, + order_by: List[InstrumentedAttribute] = [], +): columns: Dict[str, InstrumentedAttribute] = { - col_name: col_obj for col_name, col_obj in model.__dict__.items() - if isinstance(col_obj, InstrumentedAttribute) + col_name: col_obj for col_name, col_obj in model.__dict__.items() if isinstance(col_obj, InstrumentedAttribute) } - bool_map = {'true': 1, 'false': 0} + bool_map = {"true": 1, "false": 0} request_obj = request.args if request.args else json.loads(request.data) params = {k: (bool_map[v] if v in bool_map else v) for k, v in request_obj.items() if k in columns and v} like_queries = [columns[k].like(v) for k, v in params.items()] @@ -74,8 +76,12 @@ def _get_data(model: Type[DeclarativeMeta], print_sql=False, order_by: List[Inst return _get_paginated_query_result(query, print_sql=print_sql, order_by=order_by) -def _get_paginated_query_result(query: Query, return_dict=False, print_sql=False, - order_by: List[InstrumentedAttribute] = []): +def _get_paginated_query_result( + query: Query, + return_dict=False, + print_sql=False, + order_by: List[InstrumentedAttribute] = [], +): """Return paginated query result if sqlalchemy model have as_dict method. 
Method requires page_size and page in `request.args` @@ -83,7 +89,7 @@ def _get_paginated_query_result(query: Query, return_dict=False, print_sql=False p = _get_pagination() if not (p.page and p.page_size): - return {'error': "Please add page and page_size to your method."} + return {"error": "Please add page and page_size to your method."} number_of_results = query.count() limit = int(p.page_size) @@ -105,11 +111,11 @@ def _get_paginated_query_result(query: Query, return_dict=False, print_sql=False results = [x.as_dict() for x in q.all()] return { - 'page': page, - 'page_size': limit, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': results + "page": page, + "page_size": limit, + "number_of_results": number_of_results, + "pages": pages, + "results": results, } @@ -119,9 +125,9 @@ def _get_paginated_ebel_query_result(sql: str, print_sql=False): p = _get_pagination() if not (p.page and p.page_size): - return {'error': "Please add page and page_size to your method."} + return {"error": "Please add page and page_size to your method."} - number_of_results = b.query_get_dict(f"SELECT count(*) from ({sql})")[0]['count'] + number_of_results = b.query_get_dict(f"SELECT count(*) from ({sql})")[0]["count"] limit = int(p.page_size) page = int(p.page) offset = (page - 1) * limit @@ -133,11 +139,11 @@ def _get_paginated_ebel_query_result(sql: str, print_sql=False): print(sql_paginated) return { - 'page': page, - 'page_size': limit, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': [x for x in b.query_get_dict(sql_paginated)] + "page": page, + "page_size": limit, + "number_of_results": number_of_results, + "pages": pages, + "results": [x for x in b.query_get_dict(sql_paginated)], } @@ -163,16 +169,16 @@ def _get_terms_from_model_like(form_field: str, sa_column: InstrumentedAttribute """ search_term, page, page_size = None, None, None if by == "args": - page_size = request.args.get('page_size', 10) - page = request.args.get('page', 
1) + page_size = request.args.get("page_size", 10) + page = request.args.get("page", 1) search_term = request.args.get(form_field, None) elif by == "data": - page_size = request.data.get('page_size', 10) - page = request.data.get('page', 1) + page_size = request.data.get("page_size", 10) + page = request.data.get("page", 1) search_term = request.data.get(form_field, None) if not (search_term and page and page_size): - return {'error': f"{form_field} is required."} + return {"error": f"{form_field} is required."} if how == "starts_with": search_term = f"{search_term}%" @@ -183,11 +189,9 @@ def _get_terms_from_model_like(form_field: str, sa_column: InstrumentedAttribute model = sa_column.class_ primary_key = inspect(model).primary_key[0] - query = RDBMS.get_session().query( - primary_key, sa_column - ).filter( - sa_column.like(f"{search_term}%") - ).order_by(sa_column) + query = ( + RDBMS.get_session().query(primary_key, sa_column).filter(sa_column.like(f"{search_term}%")).order_by(sa_column) + ) number_of_results = query.count() @@ -198,11 +202,11 @@ def _get_terms_from_model_like(form_field: str, sa_column: InstrumentedAttribute pages = ceil(number_of_results / limit) return { - 'page': page, - 'page_size': limit, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': {x[1]: x[0] for x in query.limit(limit).offset(offset).all()} + "page": page, + "page_size": limit, + "number_of_results": number_of_results, + "pages": pages, + "results": {x[1]: x[0] for x in query.limit(limit).offset(offset).all()}, } @@ -210,36 +214,36 @@ def add_query_filters(query: Query, columns_params: Dict[str, Dict[str, str]], m """Add optional filters to query.""" col_filters = [] for column_name, v in columns_params.items(): - if v.get('how2search') and v.get('value'): - how2search = v['how2search'] + if v.get("how2search") and v.get("value"): + how2search = v["how2search"] print(how2search) - value = v['value'].strip() if isinstance(v['value'], str) else v['value'] + 
value = v["value"].strip() if isinstance(v["value"], str) else v["value"] column = inspect(model).columns[column_name] - if how2search in ('exact', 'exact_numeric'): + if how2search in ("exact", "exact_numeric"): col_filters.append(column == value) - elif how2search == 'starts_with': - col_filters.append(column.like(value + '%')) - elif how2search == 'ends_with': - col_filters.append(column.like('%' + value)) - elif how2search == 'contains': - col_filters.append(column.like('%' + value + '%')) - elif how2search == 'greater_than': + elif how2search == "starts_with": + col_filters.append(column.like(value + "%")) + elif how2search == "ends_with": + col_filters.append(column.like("%" + value)) + elif how2search == "contains": + col_filters.append(column.like("%" + value + "%")) + elif how2search == "greater_than": col_filters.append(column.__gt__(value)) - elif how2search == 'greater_equals_than': + elif how2search == "greater_equals_than": col_filters.append(column.__ge__(value)) - elif how2search == 'smaller_than': + elif how2search == "smaller_than": col_filters.append(column.__lt__(value)) - elif how2search == 'smaller_equals_than': + elif how2search == "smaller_equals_than": col_filters.append(column.__le__(value)) - elif how2search == 'not_equals': + elif how2search == "not_equals": col_filters.append(column != value) - elif how2search == 'exclude': + elif how2search == "exclude": col_filters.append(not_(column.like(value))) - elif how2search == 'between': + elif how2search == "between": found_2_values = re.search( r"(?P[+-]?\d+(\.\d+)?).*?[-,;:/].*?(?P[+-]?\d+(\.\d+)?)", - value + value, ) if found_2_values: values = sorted(found_2_values.groupdict().values(), reverse=True) diff --git a/ebel/web/api/ebel/v1/bel.py b/ebel/web/api/ebel/v1/bel.py index 2a2064a..23c8d72 100644 --- a/ebel/web/api/ebel/v1/bel.py +++ b/ebel/web/api/ebel/v1/bel.py @@ -1,26 +1,26 @@ """Generic BEL relation API methods.""" +import cgi import io import json import re -import cgi -import 
requests -import xmltodict -import pandas as pd - -from enum import Enum -from collections import namedtuple, defaultdict, Counter +from collections import Counter, defaultdict, namedtuple from copy import deepcopy +from enum import Enum from math import ceil from pathlib import Path -from typing import List, Optional, Dict, Set, NamedTuple, Any, Union, Tuple +from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Union -from flask import request, make_response, send_from_directory, send_file +import pandas as pd +import requests +import xmltodict +from flask import make_response, request, send_file, send_from_directory from graphviz import Digraph from ebel import Bel +from ebel.manager.orientdb.odb_structure import (get_columns, + get_node_view_labels) from ebel.validate import validate_bel_file -from ebel.manager.orientdb.odb_structure import get_columns, get_node_view_labels -from ebel.web.api.ebel.v1 import _get_pagination, DataType, OrientDbSqlOperator +from ebel.web.api.ebel.v1 import DataType, OrientDbSqlOperator, _get_pagination PathLengthDict = Dict[int, List[Dict[str, list]]] PathLength = int @@ -46,31 +46,31 @@ def has_name(cls, name) -> bool: return any([x.name == name for x in cls]) -BELishEdge = namedtuple('Edge', ['name', 'direction', 'params_str']) +BELishEdge = namedtuple("Edge", ["name", "direction", "params_str"]) edge_colours = { - 'increases': "limegreen", - 'directly_increases': "springgreen4", - 'decreases': "orangered", - 'directly_decreases': "red3", - 'rate_limiting_step_of': "lightslateblue", - 'regulates': "lightblue3", - 'causes_no_change': "yellow2", - 'positive_correlation': "darkolivegreen4", - 'negative_correlation': "coral3", + "increases": "limegreen", + "directly_increases": "springgreen4", + "decreases": "orangered", + "directly_decreases": "red3", + "rate_limiting_step_of": "lightslateblue", + "regulates": "lightblue3", + "causes_no_change": "yellow2", + "positive_correlation": "darkolivegreen4", + 
"negative_correlation": "coral3", } node_colours = { - 'protein': "lightblue1", - 'complex': "khaki1", - 'component': "aquamarine", - 'rna': "goldenrod1", - 'gene': "lightslateblue", - 'activity': "palegreen3", - 'abundance': 'darksalmon', - 'pathology': "palegreen", - 'drug_db': "yellow1", - 'biological_process': "snow" + "protein": "lightblue1", + "complex": "khaki1", + "component": "aquamarine", + "rna": "goldenrod1", + "gene": "lightslateblue", + "activity": "palegreen3", + "abundance": "darksalmon", + "pathology": "palegreen", + "drug_db": "yellow1", + "biological_process": "snow", } @@ -104,14 +104,14 @@ def validate_uploaded_bel_file(file, errorOutput: str = "download", forceJson: b response = send_file( return_data, as_attachment=True, - mimetype='application/vnd.ms-excel', + mimetype="application/vnd.ms-excel", attachment_filename=error_report_file, ) elif errorOutput == "json": error_df = pd.read_excel(error_report_file, index_col=[0]) # content = error_df.to_json(orient='records') - content = error_df.to_dict(orient='records') + content = error_df.to_dict(orient="records") response = {"type": "error-report", "format": "json", "content": content} elif errorOutput == "html": @@ -138,14 +138,16 @@ def validate_uploaded_bel_file(file, errorOutput: str = "download", forceJson: b class Column: """Column definition class.""" - def __init__(self, - form_name: str, - column: str, - sql_operator: OrientDbSqlOperator = OrientDbSqlOperator.EQUALS, - data_type: DataType = DataType.STRING, - value: str = None, - show_in_results: bool = True, - switch_where_terms=False): + def __init__( + self, + form_name: str, + column: str, + sql_operator: OrientDbSqlOperator = OrientDbSqlOperator.EQUALS, + data_type: DataType = DataType.STRING, + value: str = None, + show_in_results: bool = True, + switch_where_terms=False, + ): """Init method for column. 
Parameters @@ -180,34 +182,63 @@ def __str__(self): bel_relation_default_columns: List[Column] = [ - Column('subject_rid', 'out.@rid'), - Column('subject_node_class', 'out.@class'), - Column('subject_namespace', 'out.namespace'), - Column('subject_name', 'out.name', OrientDbSqlOperator.LIKE), - Column('subject_bel', 'out.bel', OrientDbSqlOperator.LIKE), - Column('subject_gene_symbol_involved_in', 'out.involved_genes', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('subject_other_involved_in', 'out.involved_other', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('relation_rid', '@rid'), - Column('relation', '@class'), - Column('evidence', 'evidence', OrientDbSqlOperator.LIKE), - Column('citation_full_journal_name', 'citation.full_journal_name', OrientDbSqlOperator.LIKE), - Column('citation_pub_date', 'citation.pub_date'), - Column('citation_pub_year', 'citation.pub_year'), - Column('citation_last_author', 'citation.last_author', OrientDbSqlOperator.LIKE), - Column('citation_type', 'citation.type'), - Column('author_in_author_list', 'citation.author_list', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('title', 'citation.title', OrientDbSqlOperator.LIKE), - Column('doi', 'citation.doi'), - Column('object_rid', 'in.@rid'), - Column('object_node_class', 'in.@class'), - Column('object_namespace', 'in.namespace'), - Column('object_name', 'in.name', OrientDbSqlOperator.LIKE), - Column('object_bel', 'in.bel', OrientDbSqlOperator.LIKE), - Column('object_gene_symbol_involved_in', 'in.involved_genes', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('object_other_involved_in', 'in.involved_other', OrientDbSqlOperator.IN, DataType.LIST_STRING), + Column("subject_rid", "out.@rid"), + Column("subject_node_class", "out.@class"), + Column("subject_namespace", "out.namespace"), + Column("subject_name", "out.name", OrientDbSqlOperator.LIKE), + Column("subject_bel", "out.bel", OrientDbSqlOperator.LIKE), + Column( + "subject_gene_symbol_involved_in", + 
"out.involved_genes", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column( + "subject_other_involved_in", + "out.involved_other", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column("relation_rid", "@rid"), + Column("relation", "@class"), + Column("evidence", "evidence", OrientDbSqlOperator.LIKE), + Column( + "citation_full_journal_name", + "citation.full_journal_name", + OrientDbSqlOperator.LIKE, + ), + Column("citation_pub_date", "citation.pub_date"), + Column("citation_pub_year", "citation.pub_year"), + Column("citation_last_author", "citation.last_author", OrientDbSqlOperator.LIKE), + Column("citation_type", "citation.type"), + Column( + "author_in_author_list", + "citation.author_list", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column("title", "citation.title", OrientDbSqlOperator.LIKE), + Column("doi", "citation.doi"), + Column("object_rid", "in.@rid"), + Column("object_node_class", "in.@class"), + Column("object_namespace", "in.namespace"), + Column("object_name", "in.name", OrientDbSqlOperator.LIKE), + Column("object_bel", "in.bel", OrientDbSqlOperator.LIKE), + Column( + "object_gene_symbol_involved_in", + "in.involved_genes", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column( + "object_other_involved_in", + "in.involved_other", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), ] -Pagination = namedtuple('Pagination', ['page', 'page_size', 'skip']) +Pagination = namedtuple("Pagination", ["page", "page_size", "skip"]) class Query: @@ -223,24 +254,24 @@ def __init__(self, odb_class: str, columns: List[Column]): @staticmethod def get_pagination() -> Pagination: """Separate results into pages of a specific length.""" - page_size = request.args.get('page_size', '10') + page_size = request.args.get("page_size", "10") page_size = int(page_size) if re.search(r"^\d+$", page_size) else 10 page_size = 10 if page_size >= 100 else page_size - page = request.args.get('page', '1') + page = request.args.get("page", 
"1") page = int(page) if re.search(r"^\d+$", page) else 1 skip = (page - 1) * page_size return Pagination(page=page, page_size=page_size, skip=skip) def get_where(self): """Generic filter execution method.""" - where = '' + where = "" wheres = [] for col in self.columns: if col.value: - if col.column.endswith('@rid'): + if col.column.endswith("@rid"): if "," in col.value: rids = [x.strip() for x in col.value.split(",") if re.search(r"^#\d+:\d+$", x.strip())] - rids_str = "[" + ','.join(rids) + "]" + rids_str = "[" + ",".join(rids) + "]" wheres.append(f"{col.column} in {rids_str}") else: rid = col.value.strip() @@ -252,22 +283,26 @@ def get_where(self): else: value = col.value - if col.data_type in [DataType.LIST_STRING, DataType.LIST_NUMBER, DataType.LIST_INTEGER]: - wheres.append(f'{value} {col.sql_operator.value} {col.column}') + if col.data_type in [ + DataType.LIST_STRING, + DataType.LIST_NUMBER, + DataType.LIST_INTEGER, + ]: + wheres.append(f"{value} {col.sql_operator.value} {col.column}") else: if col.switch_where_terms: - wheres.append(f'{value} {col.sql_operator.value} {col.column}') + wheres.append(f"{value} {col.sql_operator.value} {col.column}") else: - wheres.append(f'{col.column} {col.sql_operator.value} {value}') + wheres.append(f"{col.column} {col.sql_operator.value} {value}") if wheres: - where = " WHERE " + ' AND '.join(wheres) + where = " WHERE " + " AND ".join(wheres) return where @property def sql(self): """Generic sql execution method.""" select = "SELECT " - select += ', '.join([f"{sw.display_column} as {sw.form_name}" for sw in self.columns if sw.show_in_results]) + select += ", ".join([f"{sw.display_column} as {sw.form_name}" for sw in self.columns if sw.show_in_results]) select += " FROM " + self.odb_class sql = select + self.where return sql @@ -275,7 +310,7 @@ def sql(self): def get_number_of_results(self): """Count number of results.""" sql = "SELECT count(*) FROM " + self.odb_class + self.where - return 
self.ebel.query_get_dict(sql)[0]['count'] + return self.ebel.query_get_dict(sql)[0]["count"] def get_result(self, pagination: Optional[Pagination] = None): """Return total number of results.""" @@ -284,7 +319,7 @@ def get_result(self, pagination: Optional[Pagination] = None): else: p = self.get_pagination() if not (p.page and p.page_size): - return {'error': "Please add page and page_size to your method."} + return {"error": "Please add page and page_size to your method."} number_of_results = self.get_number_of_results() pages = ceil(number_of_results / p.page_size) @@ -292,11 +327,11 @@ def get_result(self, pagination: Optional[Pagination] = None): # print(sql_paginated) return { - 'page': p.page, - 'page_size': p.page_size, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': [x for x in self.ebel.query_get_dict(sql_paginated)] + "page": p.page, + "page_size": p.page_size, + "number_of_results": number_of_results, + "pages": pages, + "results": [x for x in self.ebel.query_get_dict(sql_paginated)], } @@ -318,8 +353,15 @@ def _get_where_by_how(column: str, value: str, how_to_search: str): def get_node_class_bel_name_ns(): """Return node class, BEL name, and namespace.""" - default_args = ('bel', 'node_name', 'namespace', 'node_class', 'how_bel', 'how_name') - args = {x: '' for x in default_args} + default_args = ( + "bel", + "node_name", + "namespace", + "node_class", + "how_bel", + "how_name", + ) + args = {x: "" for x in default_args} filtered_request_args = {k: v for k, v in request.args.items() if k in default_args} args.update(filtered_request_args) namespaces = _get_node_namespace_list(**args) @@ -327,69 +369,86 @@ def get_node_class_bel_name_ns(): suggested_node_names = _get_suggested_node_names(**args) suggested_bels = _get_suggested_bels(**args) return { - 'namespaces': namespaces, - 'node_classes': node_classes, - 'suggested_node_names': suggested_node_names, - 'suggested_bels': suggested_bels + "namespaces": namespaces, + 
"node_classes": node_classes, + "suggested_node_names": suggested_node_names, + "suggested_bels": suggested_bels, } -def _get_suggested_bels(bel: str, node_name: str, node_class: str, namespace: str, how_name: str, how_bel: str): - node_class = node_class if node_class else 'bel' +def _get_suggested_bels( + bel: str, + node_name: str, + node_class: str, + namespace: str, + how_name: str, + how_bel: str, +): + node_class = node_class if node_class else "bel" sql = f"Select bel from {node_class}" where = [] if bel: - where.append(_get_where_by_how(column='bel', value=bel, how_to_search=how_bel)) + where.append(_get_where_by_how(column="bel", value=bel, how_to_search=how_bel)) if namespace: where.append(f"namespace = '{namespace}'") if node_name: - where.append(_get_where_by_how('name', node_name, how_name)) + where.append(_get_where_by_how("name", node_name, how_name)) if where: - sql += " where " + ' and '.join(where) + sql += " where " + " and ".join(where) sql += " order by bel limit 30" print(sql) - return [y for y in [x.oRecordData.get('bel') for x in Bel().execute(sql)] if y is not None] + return [y for y in [x.oRecordData.get("bel") for x in Bel().execute(sql)] if y is not None] -def _get_suggested_node_names(bel: str, node_name: str, node_class: str, namespace: str, how_name: str, how_bel: str): - node_class = node_class if node_class else 'bel' +def _get_suggested_node_names( + bel: str, + node_name: str, + node_class: str, + namespace: str, + how_name: str, + how_bel: str, +): + node_class = node_class if node_class else "bel" sql = f"Select name from {node_class} where " where = [] - where.append(_get_where_by_how(column='name', value=node_name, how_to_search=how_name)) + where.append(_get_where_by_how(column="name", value=node_name, how_to_search=how_name)) if namespace: where.append(f"namespace = '{namespace}'") if bel: - where.append(_get_where_by_how('bel', bel, how_bel)) + where.append(_get_where_by_how("bel", bel, how_bel)) - sql += ' and 
'.join(where) + " group by name order by name limit 30" + sql += " and ".join(where) + " group by name order by name limit 30" - return [x.oRecordData['name'] for x in Bel().execute(sql)] + return [x.oRecordData["name"] for x in Bel().execute(sql)] -def _get_node_namespace_list(bel: str, node_name: str, namespace: str, node_class: str, how_name: str, how_bel: str): +def _get_node_namespace_list( + bel: str, + node_name: str, + namespace: str, + node_class: str, + how_name: str, + how_bel: str, +): """Get first names from BEL nodes (by namespace and node_class).""" if not namespace: - node_class = node_class if node_class else 'bel' + node_class = node_class if node_class else "bel" sql = f"Select namespace from {node_class} where namespace is not null " if node_name: - sql += " and " + _get_where_by_how(column='name', - value=node_name, - how_to_search=how_name) + sql += " and " + _get_where_by_how(column="name", value=node_name, how_to_search=how_name) if bel: - sql += " and " + _get_where_by_how(column='bel', - value=bel, - how_to_search=how_bel) + sql += " and " + _get_where_by_how(column="bel", value=bel, how_to_search=how_bel) sql += " group by namespace order by namespace" # print(sql) - return [x.oRecordData['namespace'] for x in Bel().execute(sql)] + return [x.oRecordData["namespace"] for x in Bel().execute(sql)] else: return [namespace] @@ -400,19 +459,15 @@ def _get_node_class_list(bel: str, node_name: str, node_class: str, namespace: s where = [] if node_name or namespace or bel: if node_name: - where.append(_get_where_by_how(column='name', - value=node_name, - how_to_search=how_name)) + where.append(_get_where_by_how(column="name", value=node_name, how_to_search=how_name)) if namespace: where.append(f"namespace = '{namespace}'") if bel: - where.append(_get_where_by_how(column='bel', - value=bel, - how_to_search=how_bel)) - sql += " where " + ' and '.join(where) + where.append(_get_where_by_how(column="bel", value=bel, how_to_search=how_bel)) + sql += " 
where " + " and ".join(where) sql += " group by @class order by @class" # print(sql) - return [x.oRecordData['node_class'] for x in Bel().execute(sql)] + return [x.oRecordData["node_class"] for x in Bel().execute(sql)] else: return [node_class] @@ -421,70 +476,95 @@ def get_namespaces(): """Get ordered list of namespaces.""" sql = "Select distinct(namespace) as namespace from bel where namespace is not null order by namespace" print(sql) - return [x.oRecordData['namespace'] for x in Bel().execute(sql)] + return [x.oRecordData["namespace"] for x in Bel().execute(sql)] def get_node_classes(): """Get ordered list of node classes.""" sql = "Select distinct(@class) as node_class from bel order by node_class" print(sql) - return [x.oRecordData['node_class'] for x in Bel().execute(sql)] + return [x.oRecordData["node_class"] for x in Bel().execute(sql)] def get_bel_relations_by_pmid(): """Return BEL relations by PMID.""" columns: List[Column] = [ - Column('subject_rid', 'out.@rid'), - Column('subject_node_class', 'out.@class'), - Column('subject_namespace', 'out.namespace'), - Column('subject_name', 'out.name', OrientDbSqlOperator.LIKE), - Column('subject_bel', 'out.bel', OrientDbSqlOperator.LIKE), - Column('subject_gene_symbol_involved_in', 'out.involved_genes', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('subject_other_involved_in', 'out.involved_other', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('relation_rid', '@rid'), - Column('relation', '@class'), - Column('evidence', 'evidence', OrientDbSqlOperator.LIKE), - Column('object_rid', 'in.@rid'), - Column('object_node_class', 'in.@class'), - Column('object_namespace', 'in.namespace'), - Column('object_name', 'in.name', OrientDbSqlOperator.LIKE), - Column('object_bel', 'in.bel', OrientDbSqlOperator.LIKE), - Column('object_gene_symbol_involved_in', 'in.involved_genes', OrientDbSqlOperator.IN, DataType.LIST_STRING), - Column('object_other_involved_in', 'in.involved_other', OrientDbSqlOperator.IN, 
DataType.LIST_STRING), + Column("subject_rid", "out.@rid"), + Column("subject_node_class", "out.@class"), + Column("subject_namespace", "out.namespace"), + Column("subject_name", "out.name", OrientDbSqlOperator.LIKE), + Column("subject_bel", "out.bel", OrientDbSqlOperator.LIKE), + Column( + "subject_gene_symbol_involved_in", + "out.involved_genes", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column( + "subject_other_involved_in", + "out.involved_other", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column("relation_rid", "@rid"), + Column("relation", "@class"), + Column("evidence", "evidence", OrientDbSqlOperator.LIKE), + Column("object_rid", "in.@rid"), + Column("object_node_class", "in.@class"), + Column("object_namespace", "in.namespace"), + Column("object_name", "in.name", OrientDbSqlOperator.LIKE), + Column("object_bel", "in.bel", OrientDbSqlOperator.LIKE), + Column( + "object_gene_symbol_involved_in", + "in.involved_genes", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column( + "object_other_involved_in", + "in.involved_other", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), ] - column_pmid = Column('pmid', 'pmid', data_type=DataType.INTEGER, value=request.args.get('pmid')) + column_pmid = Column("pmid", "pmid", data_type=DataType.INTEGER, value=request.args.get("pmid")) columns.append(column_pmid) - sql_builder = Query('bel_relation', columns) + sql_builder = Query("bel_relation", columns) return sql_builder.get_result(Pagination(1, 1000, 0)) def get_edge_by_annotation() -> list: """Return list of edges with a given annotation.""" columns = deepcopy(bel_relation_default_columns) - annotation_key = request.args.get('annotation_key') - annotation_term = request.args.get('annotation_term') + annotation_key = request.args.get("annotation_key") + annotation_term = request.args.get("annotation_term") if annotation_key and annotation_term: - column = Column('annotation_key', f"annotation['{annotation_key}']", - 
sql_operator=OrientDbSqlOperator.IN, value=annotation_term, switch_where_terms=True) + column = Column( + "annotation_key", + f"annotation['{annotation_key}']", + sql_operator=OrientDbSqlOperator.IN, + value=annotation_term, + switch_where_terms=True, + ) columns.append(column) - sql_builder = Query('bel_relation', columns) + sql_builder = Query("bel_relation", columns) return sql_builder.get_result() def get_edge_rids(): """Get edge OrientDB rids.""" - subject_rid = request.args.get('subject_rid') - relation_rid = request.args.get('relation_rid') - object_rid = request.args.get('object_rid') - document_rid = request.args.get('document_rid') + subject_rid = request.args.get("subject_rid") + relation_rid = request.args.get("relation_rid") + object_rid = request.args.get("object_rid") + document_rid = request.args.get("document_rid") columns = [ - Column(form_name='subject_rid', column='in.@rid', value=subject_rid), - Column(form_name='relation_rid', column='@rid', value=relation_rid), - Column(form_name='object_rid', column='out.@rid', value=object_rid), - Column(form_name='document_rid', column='document.@rid', value=document_rid), + Column(form_name="subject_rid", column="in.@rid", value=subject_rid), + Column(form_name="relation_rid", column="@rid", value=relation_rid), + Column(form_name="object_rid", column="out.@rid", value=object_rid), + Column(form_name="document_rid", column="document.@rid", value=document_rid), ] - sql_builder = Query('bel_relation', columns) + sql_builder = Query("bel_relation", columns) return sql_builder.get_result() @@ -498,27 +578,31 @@ def get_annotation_keys(): def get_mesh_terms_statistics_by_node_rid(): """Return MeSH term stats by node rID.""" - rid = request.args.get('node_rid') - direction = request.args.get('direction') - limit = request.args.get('limit') + rid = request.args.get("node_rid") + direction = request.args.get("direction") + limit = request.args.get("limit") sql = f"""Select list(annotation.mesh) as mesh_terms FROM 
(traverse {direction}E() FROM {rid} MAXDEPTH 1) where @rid!={rid} and annotation.mesh is not null""" res = Bel().query_get_dict(sql) - if 'mesh_terms' in res[0]: - res_dict = Counter(res[0]['mesh_terms']) - mesh_counter_list = [{'mesh_term': x[0], 'count': x[1]} - for x in sorted(res_dict.items(), key=lambda item: item[1], reverse=True)] - return mesh_counter_list[:int(limit)] if limit else mesh_counter_list + if "mesh_terms" in res[0]: + res_dict = Counter(res[0]["mesh_terms"]) + mesh_counter_list = [ + {"mesh_term": x[0], "count": x[1]} + for x in sorted(res_dict.items(), key=lambda item: item[1], reverse=True) + ] + return mesh_counter_list[: int(limit)] if limit else mesh_counter_list return [] def get_annotation_terms(): """Get the annotation terms.""" - annotation_key = request.args.get('annotation_key') + annotation_key = request.args.get("annotation_key") if annotation_key: - sql = "Select value as annotation_term, count(*) as number_of_edges from " \ - f"(Select expand(annotation['{annotation_key}']) as mesh from bel_relation " \ + sql = ( + "Select value as annotation_term, count(*) as number_of_edges from " + f"(Select expand(annotation['{annotation_key}']) as mesh from bel_relation " "where annotation.mesh is not null) group by value order by number_of_edges desc" + ) return [x.oRecordData for x in Bel().execute(sql)] @@ -529,7 +613,7 @@ def get_edges(): for column in columns: column.set_search_term(request.args.get(column.form_name)) - relation = request.args.get('relation', 'bel_relation') + relation = request.args.get("relation", "bel_relation") sql_builder = Query(relation, columns) return sql_builder.get_result() @@ -539,57 +623,65 @@ def get_nodes() -> dict: """Return list of nodes with a given namespace.""" b = Bel() where_list: List[str] = [] - params = {k: v for k, v in request.args.items() if k in ['namespace', 'name'] and v} + params = {k: v for k, v in request.args.items() if k in ["namespace", "name"] and v} - if request.args.get('pure') == 
'true': + if request.args.get("pure") == "true": params.update(pure=True) - conn2bel_rel = request.args.get('connected_to_bel_relation') + conn2bel_rel = request.args.get("connected_to_bel_relation") if conn2bel_rel: - conn2bel_rel_dir = request.args.get('connected_to_bel_relation_direction', 'both') + conn2bel_rel_dir = request.args.get("connected_to_bel_relation_direction", "both") where_list.append(f"{conn2bel_rel_dir}('{conn2bel_rel}').size()>0") - conn2ebel_rel = request.args.get('connected_to_ebel_relation') + conn2ebel_rel = request.args.get("connected_to_ebel_relation") if conn2ebel_rel: - conn2ebel_rel_dir = request.args.get('connected_to_ebel_relation_direction', 'both') + conn2ebel_rel_dir = request.args.get("connected_to_ebel_relation_direction", "both") where_list.append(f"{conn2ebel_rel_dir}('{conn2ebel_rel}').size()>0") - node_class = request.args.get('node_class') + node_class = request.args.get("node_class") p = _get_pagination() - number_of_results = b.query_class(class_name=node_class, - columns=['count(*)'], - with_rid=False, - **params)[0]['count'] + number_of_results = b.query_class(class_name=node_class, columns=["count(*)"], with_rid=False, **params)[0][ + "count" + ] pages = ceil(number_of_results / p.page_size) - results = b.query_class(class_name=node_class, - columns=['namespace', 'name', 'bel', 'pure', 'involved_genes', 'involved_other'], - skip=p.skip, - limit=p.page_size, - where_list=tuple(where_list), - print_sql=True, - **params) + results = b.query_class( + class_name=node_class, + columns=[ + "namespace", + "name", + "bel", + "pure", + "involved_genes", + "involved_other", + ], + skip=p.skip, + limit=p.page_size, + where_list=tuple(where_list), + print_sql=True, + **params, + ) return { - 'page': p.page, - 'page_size': p.page_size, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': results + "page": p.page, + "page_size": p.page_size, + "number_of_results": number_of_results, + "pages": pages, + "results": 
results, } def _get_rid() -> Optional[str]: """Get rID.""" - rid = request.args.get('rid') + rid = request.args.get("rid") if rid: rid = rid.strip() - if re.search(r'#\d+:\d+', rid): + if re.search(r"#\d+:\d+", rid): return rid def get_edge_statistics_by_rid(): """Return edge statistics for given rID.""" - rid = request.args.get('rid') - direction = request.args.get('direction', 'both') # in, out or both + rid = request.args.get("rid") + direction = request.args.get("direction", "both") # in, out or both sql = "Select @class, count(*) from (traverse {dir}E() FROM {rid} MAXDEPTH 1) where @rid!={rid} group by @class" res = Bel().query_get_dict(sql.format(dir=direction, rid=rid)) return res @@ -615,20 +707,22 @@ def get_adjacent_nodes_by_rid() -> list: # d := direction # od := oposite direction rid = _get_rid() - relation = request.args.get('relation', 'bel_relation') - sql_temp = "Select '{d}' as direction, @rid.asString() as edge_rid, @class.asString() " \ - "as edge_class, {d}.@rid.asString() as node_rid, {d}.@class.asString()as node_class , " \ - "{d}.bel as bel, {d}.name as name, {d}.namespace as namespace, {d}.involved_genes as "\ - f"involved_genes, {{d}}.involved_other as involved_other from {relation} " \ + relation = request.args.get("relation", "bel_relation") + sql_temp = ( + "Select '{d}' as direction, @rid.asString() as edge_rid, @class.asString() " + "as edge_class, {d}.@rid.asString() as node_rid, {d}.@class.asString()as node_class , " + "{d}.bel as bel, {d}.name as name, {d}.namespace as namespace, {d}.involved_genes as " + f"involved_genes, {{d}}.involved_other as involved_other from {relation} " f"where {{od}}.@rid = {rid}" + ) - direction = request.args.get('direction', 'both') + direction = request.args.get("direction", "both") if rid: - sql_in = sql_temp.format(d='in', od='out') - sql_out = sql_temp.format(d='out', od='in') - if direction == 'in': + sql_in = sql_temp.format(d="in", od="out") + sql_out = sql_temp.format(d="out", od="in") + if 
direction == "in": sql = sql_in - elif direction == 'out': + elif direction == "out": sql = sql_out else: sql = f"select expand($c) let $a = ({sql_in}), $b = ({sql_out}), $c = unionAll( $a, $b )" @@ -638,50 +732,52 @@ def get_adjacent_nodes_by_rid() -> list: def get_number_of_edges() -> int: """Return the number of edges.""" b = Bel() - relation = request.args.get('relation', 'E') + relation = request.args.get("relation", "E") r = b.execute(f"Select count(*) as number_of_edges from {relation} limit 1") - return r[0].oRecordData['number_of_edges'] + return r[0].oRecordData["number_of_edges"] def get_citation_by_pmid() -> dict: """Return the number of edges.""" b = Bel() - pmid = request.args.get('pmid') + pmid = request.args.get("pmid") r = b.execute(f"Select citation from bel_relation where pmid = {pmid} limit 1") - return r[0].oRecordData['citation'] + return r[0].oRecordData["citation"] def get_abstract_by_pmid(): """Return abstract by PMID.""" - pmid = request.args.get('pmid') + pmid = request.args.get("pmid") url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&retmode=XML&rettype=abstract" r = requests.get(url.format(pmid=pmid)) d = xmltodict.parse(r.text) - return d['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Abstract']['AbstractText'] + return d["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] def get_number_of_nodes() -> int: """Return the number of edges.""" b = Bel() - node_class = request.args.get('node_class', 'V') - pure = request.args.get('pure') - where_pure = "where pure = true" if pure else '' + node_class = request.args.get("node_class", "V") + pure = request.args.get("pure") + where_pure = "where pure = true" if pure else "" sql = f"Select count(*) as number_of_nodes from {node_class} {where_pure} limit 1" r = b.execute(sql) - return r[0].oRecordData['number_of_nodes'] + return r[0].oRecordData["number_of_nodes"] def get_pure_rid() -> 
Optional[str]: """Return None or the rID from the node class.""" b = Bel() - node_class = request.args.get('node_class', 'protein') - namespace = request.args.get('namespace', 'HGNC') - name = request.args.get('name') + node_class = request.args.get("node_class", "protein") + namespace = request.args.get("namespace", "HGNC") + name = request.args.get("name") if name: - sql = f"Select @rid.asString() as rid from {node_class} " \ - f"where name='{name}' and pure=true and namespace='{namespace}' limit 1" - return b.execute(sql)[0].oRecordData['rid'] + sql = ( + f"Select @rid.asString() as rid from {node_class} " + f"where name='{name}' and pure=true and namespace='{namespace}' limit 1" + ) + return b.execute(sql)[0].oRecordData["rid"] class Position(EnumExtension): @@ -695,24 +791,30 @@ class Position(EnumExtension): class SearchType(EnumExtension): """SearchType constants.""" - EXACT = 'exact' - CONTAINS = 'contains' - CASE_SENSITIVE = 'case_insensitive' - STARTS_WITH = 'starts_with' - ENDS_WITH = 'ends_with' - GREATER_THAN = 'greater_than' - GREATER_OR_EQUALS_THAN = 'greater_or_equals_than' - SMALLER_THAN = 'smaller_than' - SMALLER_OR_EQUALS_THAN = 'smaller_or_equals_than' + EXACT = "exact" + CONTAINS = "contains" + CASE_SENSITIVE = "case_insensitive" + STARTS_WITH = "starts_with" + ENDS_WITH = "ends_with" + GREATER_THAN = "greater_than" + GREATER_OR_EQUALS_THAN = "greater_or_equals_than" + SMALLER_THAN = "smaller_than" + SMALLER_OR_EQUALS_THAN = "smaller_or_equals_than" class MatchEdge: """Class to construct the edge portions of a MATCH query.""" - def __init__(self, edge_class: str, multiple_edge_classes: str, mesh_terms: List[str], pmids: List[int]): + def __init__( + self, + edge_class: str, + multiple_edge_classes: str, + mesh_terms: List[str], + pmids: List[int], + ): """Init method.""" self.position: Optional[Position] = None - self.node_class = 'bel' + self.node_class = "bel" self.edge_class = edge_class self.pmids = pmids self.mesh_terms = mesh_terms @@ 
-726,12 +828,12 @@ def set_last(self, node_class: Optional[str]): def get_edge(self, alias_number: int) -> str: """Return edge based on alias number.""" - mesh_or = '' + mesh_or = "" if self.mesh_terms: - mesh_or = " OR ".join(["'" + x.replace("'", '') + "' in annotation.mesh" for x in self.mesh_terms]) + mesh_or = " OR ".join(["'" + x.replace("'", "") + "' in annotation.mesh" for x in self.mesh_terms]) mesh_or = f"({mesh_or})" - pmids_in = '' + pmids_in = "" if self.pmids: if len(self.pmids) == 1: pmids_in = f"pmid = {self.pmids[0]}" @@ -739,9 +841,9 @@ def get_edge(self, alias_number: int) -> str: pmids_in = "pmid in " + str(self.pmids) pmids_in = f"{pmids_in}" - where = ' AND '.join([x for x in [mesh_or, pmids_in] if x]) + where = " AND ".join([x for x in [mesh_or, pmids_in] if x]) - where_str = f"where:({where}), " if any([mesh_or, pmids_in]) else '' + where_str = f"where:({where}), " if any([mesh_or, pmids_in]) else "" e_class_multi, e_class_single = self.get_edge_classes() return f".outE({e_class_multi}){{{e_class_single}{where_str}as:e{alias_number}}}.inV()" @@ -749,15 +851,15 @@ def get_edge(self, alias_number: int) -> str: def get_edge_classes(self): """Return edge classes of match query.""" """""" - e_class_single = '' - e_class_multi = '' + e_class_single = "" + e_class_multi = "" if isinstance(self.multiple_edge_classes, str) and self.multiple_edge_classes.strip(): - edge_classes = [x.strip() for x in self.multiple_edge_classes.split(',') if x.strip()] + edge_classes = [x.strip() for x in self.multiple_edge_classes.split(",") if x.strip()] if len(edge_classes) == 1: e_class_single = f"class:{edge_classes[0]}, " elif len(edge_classes) > 1: - edge_classes = ["'" + re.sub(r'\W+', "", x) + "'" for x in edge_classes] + edge_classes = ["'" + re.sub(r"\W+", "", x) + "'" for x in edge_classes] e_class_multi = ",".join(edge_classes) elif isinstance(self.edge_class, str) and self.edge_class.strip(): @@ -780,14 +882,16 @@ def __init__(self): self.how_name = None 
self.how_bel = None - def set_outside(self, - position: Position, - name: Optional[str] = None, - node_class: Optional[str] = None, - namespace: Optional[str] = None, - bel: Optional[str] = None, - how_name: Optional[str] = SearchType.EXACT.value, - how_bel: Optional[str] = SearchType.EXACT.value): + def set_outside( + self, + position: Position, + name: Optional[str] = None, + node_class: Optional[str] = None, + namespace: Optional[str] = None, + bel: Optional[str] = None, + how_name: Optional[str] = SearchType.EXACT.value, + how_bel: Optional[str] = SearchType.EXACT.value, + ): """Assign attributes to the "outside" position.""" self.position = position if name: @@ -815,14 +919,14 @@ def get_node(self, alias_number: int) -> str: """Return node based on alias number.""" print(f"hows: \n\tposition{self.position} \n\thow_name: {self.how_name}, \n\thow_bel: {self.how_bel}") namespace = f"namespace='{self.namespace}'" - name = _get_where_by_how('name', self.name, self.how_name) - bel = _get_where_by_how('bel', self.bel, self.how_bel) + name = _get_where_by_how("name", self.name, self.how_name) + bel = _get_where_by_how("bel", self.bel, self.how_bel) name_involved = f"('{self.name}' in involved_genes OR '{self.name}' in involved_other)" involved_genes = "involved_genes.size()>0" not_like_node = "$matched.n{}!=$currentMatch" where_inside_list = [] - where_str = '' + where_str = "" if self.position in (Position.FIRST, Position.LAST): where_first_last = [] @@ -831,30 +935,30 @@ def get_node(self, alias_number: int) -> str: if self.namespace and not self.name: where_first_last.append(namespace) elif self.name and not self.namespace: - if self.node_class in ['gene', 'rna', 'protein']: + if self.node_class in ["gene", "rna", "protein"]: where_first_last.append(name) else: where_first_last.append(name_involved) elif self.name and self.namespace: - where_first_last.append(f'{name} AND {namespace}') + where_first_last.append(f"{name} AND {namespace}") where_str = " AND 
".join(where_first_last) if self.position == Position.LAST: where_inside_list.append(not_like_node.format(1)) if self.gene_path and not any([self.node_class, self.name, self.namespace]): where_inside_list.append(involved_genes) - where_str = ' AND '.join([x for x in ([where_str] + where_inside_list) if x]) + where_str = " AND ".join([x for x in ([where_str] + where_inside_list) if x]) alias = f"as:n{alias_number}" - where = f"where:({where_str})" if where_str else '' + where = f"where:({where_str})" if where_str else "" node_class = self.get_node_class() - node_query_str = ', '.join([x for x in [node_class, where, alias] if x]) + node_query_str = ", ".join([x for x in [node_class, where, alias] if x]) return "{" + node_query_str + "}" def get_node_class(self): """Return node class of match query.""" - node_class = '' + node_class = "" if self.node_class: node_class = f"class:{self.node_class}" return node_class @@ -863,8 +967,8 @@ def get_node_class(self): class GraphType(EnumExtension): """Not sure why there is a class for defining constants.""" - NODES = 'nodes' - EDGES = 'edges' + NODES = "nodes" + EDGES = "edges" class PathsResult(NamedTuple): @@ -878,30 +982,32 @@ class PathsResult(NamedTuple): class PathQuery: """Class for constructing a path-based query.""" - def __init__(self, - start_name: str, - end_name: str, - min_length: int, - max_length: int, - start_how_name: Optional[str] = None, - end_how_name: Optional[str] = None, - start_class: Optional[str] = None, - end_class: Optional[str] = None, - start_ns: Optional[str] = None, - end_ns: Optional[str] = None, - start_bel: Optional[str] = None, - start_how_bel: Optional[str] = None, - end_bel: Optional[str] = None, - end_how_bel: Optional[str] = None, - gene_path: bool = False, - edge_class: Optional[str] = None, - multiple_edge_classes: Optional[str] = None, - inside_node_class: Optional[str] = None, - mesh_term: Optional[str] = None, - pmids: str = '', - belish: Optional[str] = None, - limit: int = 0, - 
skip: int = 0): + def __init__( + self, + start_name: str, + end_name: str, + min_length: int, + max_length: int, + start_how_name: Optional[str] = None, + end_how_name: Optional[str] = None, + start_class: Optional[str] = None, + end_class: Optional[str] = None, + start_ns: Optional[str] = None, + end_ns: Optional[str] = None, + start_bel: Optional[str] = None, + start_how_bel: Optional[str] = None, + end_bel: Optional[str] = None, + end_how_bel: Optional[str] = None, + gene_path: bool = False, + edge_class: Optional[str] = None, + multiple_edge_classes: Optional[str] = None, + inside_node_class: Optional[str] = None, + mesh_term: Optional[str] = None, + pmids: str = "", + belish: Optional[str] = None, + limit: int = 0, + skip: int = 0, + ): """Init method.""" self.multiple_edge_classes = multiple_edge_classes self.limit = limit @@ -914,8 +1020,8 @@ def __init__(self, if isinstance(self.skip, str) and self.skip.isnumeric(): self.skip = int(self.skip) - self.pmids = [int(x.strip()) for x in pmids.split(',') if x.strip().isdigit()] - self.mesh_terms = [x.strip() for x in mesh_term.split(';') if x.strip()] + self.pmids = [int(x.strip()) for x in pmids.split(",") if x.strip().isdigit()] + self.mesh_terms = [x.strip() for x in mesh_term.split(";") if x.strip()] self.execute = Bel().execute self.min_length = min_length self.max_length = max_length @@ -932,7 +1038,8 @@ def __init__(self, namespace=start_ns, bel=start_bel, how_name=start_how_name, - how_bel=start_how_bel) + how_bel=start_how_bel, + ) self.nodes[-1].set_outside( position=Position.LAST, @@ -941,30 +1048,35 @@ def __init__(self, namespace=end_ns, bel=end_bel, how_name=end_how_name, - how_bel=end_how_bel) + how_bel=end_how_bel, + ) - self.edges = [MatchEdge(self.edge_class, - self.multiple_edge_classes, - self.mesh_terms, - self.pmids) for _ in range(self.max_length)] + self.edges = [ + MatchEdge(self.edge_class, self.multiple_edge_classes, self.mesh_terms, self.pmids) + for _ in range(self.max_length) + ] 
self.edges[-1].set_last(end_class) # TODO: why not using normal edge_class? for node in self.nodes[1:-1]: node.set_inside(gene_path, inside_node_class) - self.too_many_paths = "With the path length of {} we found already more than " \ - f"{self.max_paths} pathways. Please specify you query (or set limit) and run again." + self.too_many_paths = ( + "With the path length of {} we found already more than " + f"{self.max_paths} pathways. Please specify you query (or set limit) and run again." + ) - self.too_many_edges = f"We found too many unique edges ({{}}, max allowed={self.max_unique_edges} ) with an " \ - f"allowed maximum of {self.max_paths} paths. Please specify you query and run again. " \ - "Decrease max path length, use limit or state start- and end-node more precisely." + self.too_many_edges = ( + f"We found too many unique edges ({{}}, max allowed={self.max_unique_edges} ) with an " + f"allowed maximum of {self.max_paths} paths. Please specify you query and run again. " + "Decrease max path length, use limit or state start- and end-node more precisely." 
+ ) def get_query_str(self, number_of_edges): """Create query string by number of edges.""" query = "match " + self.nodes[0].get_node(1) - edges = self.edges[-1 * number_of_edges:] - nodes = self.nodes[-1 * (number_of_edges + 1):] + edges = self.edges[-1 * number_of_edges :] + nodes = self.nodes[-1 * (number_of_edges + 1) :] for i in range(1, number_of_edges + 1): query += edges[i - 1].get_edge(alias_number=i) + nodes[i].get_node(alias_number=i + 1) @@ -974,8 +1086,11 @@ def get_query_str(self, number_of_edges): @staticmethod def _get_unique_rids(graph_type: GraphType, path_length_dict: PathLengthDict) -> Set[Rid]: """Get unique node or edge rid set.""" - rids = {w for z in [[x for y in [en[graph_type.value] for en in v] for x in y] for _, v in - path_length_dict.items()] for w in z} + rids = { + w + for z in [[x for y in [en[graph_type.value] for en in v] for x in y] for _, v in path_length_dict.items()] + for w in z + } return rids def get_unique_edge_list(self, path_length_dict: PathLengthDict) -> Dict[Rid, EdgeInfo]: @@ -990,16 +1105,18 @@ def get_unique_node_list(self, path_length_dict: PathLengthDict) -> Dict[Rid, No def get_edge_info(self, rid: Rid): """Get edge metadata by given rID.""" - sql = f"Select out.@rid.asString() as subject_rid, in.@rid.asString() as object_rid, " \ - f"out.bel as subject_bel, in.bel as object_bel," \ - f"@class.asString() as class, citation, evidence, pmid, annotation.mesh from {rid}" + sql = ( + f"Select out.@rid.asString() as subject_rid, in.@rid.asString() as object_rid, " + f"out.bel as subject_bel, in.bel as object_bel," + f"@class.asString() as class, citation, evidence, pmid, annotation.mesh from {rid}" + ) return self.execute(sql)[0].oRecordData def get_node_info(self, rid: Rid): """Get node metadata by given rID.""" sql = f"Select @class.asString() as class, * from {rid}" data = self.execute(sql)[0].oRecordData - serializable_columns = get_columns(data['class'], exclude_non_serializable=True) + ['class'] + 
serializable_columns = get_columns(data["class"], exclude_non_serializable=True) + ["class"] serializable_data = {k: v for k, v in data.items() if k in serializable_columns} return serializable_data @@ -1013,16 +1130,16 @@ def allowed_edges(self) -> List[str]: def get_match_return(self, number_of_edges): """Standard way to return node and edges.""" edges = [f"e{i}.@rid.asString()" for i in range(1, number_of_edges + 1)] - edges_join = ','.join(edges) + edges_join = ",".join(edges) nodes = [f"n{i}.@rid.asString()" for i in range(1, number_of_edges + 2)] - nodes_join = ','.join(nodes) + nodes_join = ",".join(nodes) if self.limit and self.limit <= (self.max_paths + 1): limit = self.limit else: limit = self.max_paths + 1 - skip = f" skip {self.skip} " if self.skip else '' + skip = f" skip {self.skip} " if self.skip else "" return f" return [{edges_join}] as edges, [{nodes_join}] as nodes {skip} limit {limit}" @@ -1042,35 +1159,41 @@ def get_belish_match_str(self, edges, node_strings): """Build the MATCH string based on the BELish query.""" # ALERT: if a where in the following node multi class of edge are ignored edge_direction = { - '->': {'one_class': ".outE(){{class:{},as:e{}{}}}.inV()", 'multi_class': ".outE({}){{as:e{}{}}}.inV()"}, - '<-': {'one_class': ".inE(){{class:{},as:e{}{}}}.outV()", 'multi_class': ".inE({}){{as:e{}{}}}.outV()"}, + "->": { + "one_class": ".outE(){{class:{},as:e{}{}}}.inV()", + "multi_class": ".outE({}){{as:e{}{}}}.inV()", + }, + "<-": { + "one_class": ".inE(){{class:{},as:e{}{}}}.outV()", + "multi_class": ".inE({}){{as:e{}{}}}.outV()", + }, } re_node_in_box = re.compile( r'^\[\s*(?P\w+)(?P(\s+\w+(\.\w+)?(!=|=|>|<|~|\*)(\d+|\d+\.\d+|[\w%]+|"[^"]+"))*)\s*\]$' ) - match_str = 'match ' + match_str = "match " for i in range(len(node_strings)): found_node_in_box = re_node_in_box.search(node_strings[i]) if found_node_in_box: - node_where = '' + node_where = "" node_groups = found_node_in_box.groupdict() - if node_groups['params']: - node_where = 
self.get_where_list_by_params(node_groups['params']) + if node_groups["params"]: + node_where = self.get_where_list_by_params(node_groups["params"]) match_str += f"{{class:{node_groups['class_name']} {node_where}, as:n{i + 1}}}" else: match_str += f"{{class:bel, where:(bel like '{node_strings[i]}'), as:n{i + 1}}}" if i <= len(edges) - 1: edge_temp = edge_direction[edges[i].direction] - edge_class_names = [x.strip() for x in edges[i].name.split(',') if x.strip()] - edge_where = '' + edge_class_names = [x.strip() for x in edges[i].name.split(",") if x.strip()] + edge_where = "" if edges[i].params_str: edge_where = self.get_where_list_by_params(edges[i].params_str) if len(edge_class_names) == 1: - match_str += edge_temp['one_class'].format(edge_class_names[0], i + 1, edge_where) + match_str += edge_temp["one_class"].format(edge_class_names[0], i + 1, edge_where) else: - edge_class_names_joined = ','.join([f'"{x}"' for x in edge_class_names]) - match_str += edge_temp['multi_class'].format(edge_class_names_joined, i + 1, edge_where) + edge_class_names_joined = ",".join([f'"{x}"' for x in edge_class_names]) + match_str += edge_temp["multi_class"].format(edge_class_names_joined, i + 1, edge_where) match_str += self.get_match_return(len(edges)) return match_str @@ -1081,34 +1204,46 @@ def get_where_list_by_params(params_str): where_list = [] re_params_in_box = re.compile(r'(\w+(\.\w+)?)(!=|=|>|<|~|\*)(\d+|\d+\.\d+|[\w%]+|"[^"]+")') for param, sub_param, operator, value in re_params_in_box.findall(params_str): - operator = 'like' if operator == '~' else operator - operator = 'in' if operator == '*' else operator - equals_or_in_and_number = operator in ['=', 'in'] and re.search(r'^\d+(\.\d+)?$', value) + operator = "like" if operator == "~" else operator + operator = "in" if operator == "*" else operator + equals_or_in_and_number = operator in ["=", "in"] and re.search(r"^\d+(\.\d+)?$", value) quotes_surrounded = re.search('^".*"$', value) - if not (operator in ['>', '<'] or 
equals_or_in_and_number or quotes_surrounded): + if not (operator in [">", "<"] or equals_or_in_and_number or quotes_surrounded): value = f'"{value}"' - if operator == 'in': + if operator == "in": where_list.append(f"{value} {operator} {param}") else: where_list.append(f"{param} {operator} {value}") - where = ", where:(" + ' AND '.join(where_list) + ")" + where = ", where:(" + " AND ".join(where_list) + ")" return where def get_belish_nodes_edges(self) -> Tuple[List[str], List[BELishEdge]]: """Return all BELish nodes and edges.""" - r = re.split(r"\s+(-(([a-z_0-9,]+)(\s+.*?)?)(->)|(<-)(([a-z_0-9,]+)(\s+.*?)?)-|-(->)|(<-)-)\s+", self.belish) + r = re.split( + r"\s+(-(([a-z_0-9,]+)(\s+.*?)?)(->)|(<-)(([a-z_0-9,]+)(\s+.*?)?)-|-(->)|(<-)-)\s+", + self.belish, + ) nodes: List[str] = r[::12] - edge_zip = zip(r[8::12], r[3::12], r[1::12], r[5::12], r[6::12], r[10::12], r[11::12], r[4::12]) + edge_zip = zip( + r[8::12], + r[3::12], + r[1::12], + r[5::12], + r[6::12], + r[10::12], + r[11::12], + r[4::12], + ) edges: List[BELishEdge] = [ - BELishEdge(x[0] or x[1] or '', x[3] or x[4] or x[5] or x[6], x[7]) for x in edge_zip + BELishEdge(x[0] or x[1] or "", x[3] or x[4] or x[5] or x[6], x[7]) for x in edge_zip ] return nodes, edges def get_paths(self) -> Union[PathsResult, Dict]: """Get paths by query.""" self.max_paths = 100000 - if self.edge_class and not (self.edge_class in self.allowed_edges or self.edge_class == 'E'): - return {'error': "Unknown relation type."} + if self.edge_class and not (self.edge_class in self.allowed_edges or self.edge_class == "E"): + return {"error": "Unknown relation type."} path_length_dict: PathLengthDict = {} edge_paths_by_length: EdgePathsByLength = {} @@ -1119,19 +1254,21 @@ def get_paths(self) -> Union[PathsResult, Dict]: paths: List[Dict[str, Any]] = [x.oRecordData for x in self.execute(query_str)] if len(paths) > self.max_paths: - return {'error': self.too_many_paths.format(number_of_edges)} + return {"error": 
self.too_many_paths.format(number_of_edges)} path_length_dict[number_of_edges] = paths - edge_paths_by_length[number_of_edges] = [x['edges'] for x in paths] + edge_paths_by_length[number_of_edges] = [x["edges"] for x in paths] unique_edges: Dict[Rid, EdgeInfo] = self.get_unique_edge_list(path_length_dict) if len(unique_edges) > self.max_unique_edges: - return {'error': self.too_many_edges.format(len(unique_edges))} + return {"error": self.too_many_edges.format(len(unique_edges))} unique_nodes: Dict[Rid, NodeInfo] = self.get_unique_node_list(path_length_dict) - paths_results = PathsResult(edge_paths_by_length=edge_paths_by_length, - unique_edges=unique_edges, - unique_nodes=unique_nodes) + paths_results = PathsResult( + edge_paths_by_length=edge_paths_by_length, + unique_edges=unique_edges, + unique_nodes=unique_nodes, + ) return paths_results def get_paths_by_belish(self): @@ -1146,18 +1283,20 @@ def get_paths_by_belish(self): query_str, number_of_edges = query_str_belish_num_edges paths: List[Dict[str, Any]] = [x.oRecordData for x in self.execute(query_str)] if len(paths) > self.max_paths: - return {'error': self.too_many_paths.format(number_of_edges)} + return {"error": self.too_many_paths.format(number_of_edges)} path_length_dict[number_of_edges] = paths - edge_paths_by_length[number_of_edges] = [x['edges'] for x in paths] + edge_paths_by_length[number_of_edges] = [x["edges"] for x in paths] unique_edges: Dict[Rid, EdgeInfo] = self.get_unique_edge_list(path_length_dict) if len(unique_edges) > self.max_unique_edges: - return {'error': self.too_many_edges.format(len(unique_edges))} + return {"error": self.too_many_edges.format(len(unique_edges))} unique_nodes: Dict[Rid, NodeInfo] = self.get_unique_node_list(path_length_dict) - paths_results = PathsResult(edge_paths_by_length=edge_paths_by_length, - unique_edges=unique_edges, - unique_nodes=unique_nodes) + paths_results = PathsResult( + edge_paths_by_length=edge_paths_by_length, + unique_edges=unique_edges, + 
unique_nodes=unique_nodes, + ) return paths_results @@ -1189,39 +1328,39 @@ def _get_path_query() -> Union[PathQuery, ErrorMessage]: ------ ErrorMessage """ - start_name = request.args.get('start_node_name') - end_name = request.args.get('end_node_name') - start_how_name = request.args.get('start_how_node_name') - end_how_name = request.args.get('end_how_node_name') - start_class = request.args.get('start_node_class', 'bel') - end_class = request.args.get('end_node_class', 'bel') - start_ns = request.args.get('start_node_namespace') - end_ns = request.args.get('end_node_namespace') - start_bel = request.args.get('start_bel') - start_how_bel = request.args.get('start_how_bel') - end_bel = request.args.get('end_bel') - end_how_bel = request.args.get('end_how_bel') - edge_class = request.args.get('connecting_relation') - multiple_edge_classes = request.args.get('multiple_connecting_relations', '') - inside_node_class = request.args.get('connecting_node_class') - gene_path = request.args.get('only_gene_related_nodes_on_path') - pmid = request.args.get('pmid', '') - mesh_term = request.args.get('mesh_term', '') - gene_path = True if gene_path == 'true' else False - limit = request.args.get('limit', '') + start_name = request.args.get("start_node_name") + end_name = request.args.get("end_node_name") + start_how_name = request.args.get("start_how_node_name") + end_how_name = request.args.get("end_how_node_name") + start_class = request.args.get("start_node_class", "bel") + end_class = request.args.get("end_node_class", "bel") + start_ns = request.args.get("start_node_namespace") + end_ns = request.args.get("end_node_namespace") + start_bel = request.args.get("start_bel") + start_how_bel = request.args.get("start_how_bel") + end_bel = request.args.get("end_bel") + end_how_bel = request.args.get("end_how_bel") + edge_class = request.args.get("connecting_relation") + multiple_edge_classes = request.args.get("multiple_connecting_relations", "") + inside_node_class = 
request.args.get("connecting_node_class") + gene_path = request.args.get("only_gene_related_nodes_on_path") + pmid = request.args.get("pmid", "") + mesh_term = request.args.get("mesh_term", "") + gene_path = True if gene_path == "true" else False + limit = request.args.get("limit", "") limit = int(limit) if limit.isnumeric() else 0 - belish = request.args.get('belish', '') + belish = request.args.get("belish", "") min_value = 1 max_value = 10 - min_str = request.args.get('min_path_length') + min_str = request.args.get("min_path_length") min_length = _get_number(min_str, 1, min_value=min_value) - max_str = request.args.get('max_path_length') + max_str = request.args.get("max_path_length") max_length = _get_number(max_str, 3, max_value=max_value) if pmid: - edge_class = 'bel_relation' - end_class = 'bel' + edge_class = "bel_relation" + end_class = "bel" max_length = 1 path_query = PathQuery( @@ -1246,13 +1385,14 @@ def _get_path_query() -> Union[PathQuery, ErrorMessage]: mesh_term=mesh_term, pmids=pmid, belish=belish, - limit=limit) + limit=limit, + ) return path_query def get_paths() -> Union[dict, PathQuery]: """Return paths found for query.""" - print('get_paths:\n\n', request.args) + print("get_paths:\n\n", request.args) path_query = _get_path_query() if isinstance(path_query, PathQuery): paths = path_query.get_paths() @@ -1269,7 +1409,7 @@ def get_paths_by_belish() -> Union[dict, PathQuery]: path_query = _get_path_query() if isinstance(path_query, PathQuery): paths_by_belish = path_query.get_paths_by_belish() - print('path_query:', type(paths_by_belish)) + print("path_query:", type(paths_by_belish)) if isinstance(paths_by_belish, dict): return paths_by_belish else: @@ -1295,58 +1435,68 @@ def _get_paths_as_dot(paths): if isinstance(paths, PathsResult): edges = defaultdict(int) d = Digraph() - d.attr('graph', fontname="helvetica") - d.attr('node', shape='note') + d.attr("graph", fontname="helvetica") + d.attr("node", shape="note") if len(paths.unique_nodes) == 0: 
- row_template = '{}:' \ - '{}' + row_template = '{}:' '{}' key_value_dict = {} for k, v in request.args.items(): - key_value_dict[k] = cgi.html.escape(v).encode('ascii', 'xmlcharrefreplace').decode("utf-8") - legend_rows = '
'.join([row_template.format(k, v) for k, v in key_value_dict.items()]) - d.node('legend', f'<NO PATHS FOUND!
{legend_rows}>') + key_value_dict[k] = cgi.html.escape(v).encode("ascii", "xmlcharrefreplace").decode("utf-8") + legend_rows = "
".join([row_template.format(k, v) for k, v in key_value_dict.items()]) + d.node( + "legend", + f'<NO PATHS FOUND!
{legend_rows}>', + ) - d.attr('node', shape='box') - d.attr('node', style='filled') + d.attr("node", shape="box") + d.attr("node", style="filled") for rid, v in paths.unique_nodes.items(): - node_id = rid.replace(':', '.') - d.attr('node', fillcolor=node_colours.get(v['class'], 'grey')) - view_labels = get_node_view_labels(v['class']) - - sub_label = view_labels['sub_label'] - if 'involved_genes' in v or 'involved_other' in v: - involved = ','.join( - v['involved_genes'] + v['involved_other'] - ).replace('<', '<').replace('>', '>') - bel_str = v["bel"].replace('<', '<').replace('>', '>') - node_label = f'<{v["class"]}
{involved}'\ - f'
{bel_str}>' + node_id = rid.replace(":", ".") + d.attr("node", fillcolor=node_colours.get(v["class"], "grey")) + view_labels = get_node_view_labels(v["class"]) + + sub_label = view_labels["sub_label"] + if "involved_genes" in v or "involved_other" in v: + involved = ( + ",".join(v["involved_genes"] + v["involved_other"]).replace("<", "<").replace(">", ">") + ) + bel_str = v["bel"].replace("<", "<").replace(">", ">") + node_label = ( + f'<{v["class"]}
{involved}' + f'
{bel_str}>' + ) else: - label_col_list = view_labels['label'] - label_value = '' + label_col_list = view_labels["label"] + label_value = "" if len(label_col_list) == 1: label_value = v[label_col_list[0]] elif len(label_col_list) >= 1: - label_value = '; '.join([str(v.get(x, '')) for x in label_col_list]) + label_value = "; ".join([str(v.get(x, "")) for x in label_col_list]) - sub_label_value = '' + sub_label_value = "" if sub_label and v.get(sub_label[0]): sub_label_value = '
' + v[sub_label[0]] - node_label = f'<{v["class"]}
' \ - f'{label_value}{sub_label_value}>' + node_label = ( + f'<{v["class"]}
' + f'{label_value}{sub_label_value}>' + ) d.node(node_id, node_label) for v in paths.unique_edges.values(): - s_rid = v['subject_rid'].replace(':', '.') - o_rid = v['object_rid'].replace(':', '.') - edges[(s_rid, o_rid, v['class'])] += 1 + s_rid = v["subject_rid"].replace(":", ".") + o_rid = v["object_rid"].replace(":", ".") + edges[(s_rid, o_rid, v["class"])] += 1 for edge, number_of_edge in edges.items(): s_rid, o_rid, label = edge - d.attr('edge', color=edge_colours.get(label, 'grey')) - d.edge(s_rid, o_rid, f'<{label} [{number_of_edge}]>') + d.attr("edge", color=edge_colours.get(label, "grey")) + d.edge( + s_rid, + o_rid, + f'<{label} [{number_of_edge}]>', + ) return d.source else: return paths @@ -1354,16 +1504,20 @@ def _get_paths_as_dot(paths): def get_publication_year_statistics(): """Return publication counts by year derived from BEL edges.""" - sql = "Select year, count(*) as number_of_edges from (Select citation.pub_date.left(4) as year " \ - "from bel_relation) where year!='' group by year order by year desc" + sql = ( + "Select year, count(*) as number_of_edges from (Select citation.pub_date.left(4) as year " + "from bel_relation) where year!='' group by year order by year desc" + ) return [x.oRecordData for x in Bel().execute(sql)] def get_class_infos(): """Return node or edge class metadata.""" b = Bel() - sql = "SELECT name, superClass, abstract, properties FROM (select expand(classes) " \ - "FROM metadata:schema) WHERE NOT (name LIKE 'O%' OR name like '_%')" + sql = ( + "SELECT name, superClass, abstract, properties FROM (select expand(classes) " + "FROM metadata:schema) WHERE NOT (name LIKE 'O%' OR name like '_%')" + ) class_name_dict = {} parent_dict = defaultdict(list) in_out_dict = {} @@ -1371,83 +1525,85 @@ def get_class_infos(): out_edge_class_dict = defaultdict(list) for row in b.execute(sql): r = dict(row.oRecordData) - in_out = {p['name']: p['linkedClass'] for p in r['properties'] if - 'linkedClass' in p and p['name'] in ['in', 'out']} + 
in_out = { + p["name"]: p["linkedClass"] for p in r["properties"] if "linkedClass" in p and p["name"] in ["in", "out"] + } if in_out: - in_out_dict[r['name']] = in_out - in_edge_class_dict[in_out['in']].append(r['name']) - out_edge_class_dict[in_out['out']].append(r['name']) - r.pop('properties') - class_name_dict[r['name']] = r - if r.get('superClass'): # all except roots - parent_dict[r['superClass']].append({'name': r['name'], 'abstract': r['abstract']}) + in_out_dict[r["name"]] = in_out + in_edge_class_dict[in_out["in"]].append(r["name"]) + out_edge_class_dict[in_out["out"]].append(r["name"]) + r.pop("properties") + class_name_dict[r["name"]] = r + if r.get("superClass"): # all except roots + parent_dict[r["superClass"]].append({"name": r["name"], "abstract": r["abstract"]}) results = {} for class_name in class_name_dict: cnd = class_name_dict[class_name] - result = {'abstract': cnd['abstract'], - 'parents_path': [class_name], - 'children': parent_dict[class_name], - } + result = { + "abstract": cnd["abstract"], + "parents_path": [class_name], + "children": parent_dict[class_name], + } check4parent = True while check4parent: - last_parent = result['parents_path'][-1] + last_parent = result["parents_path"][-1] if last_parent not in class_name_dict: break - parent = class_name_dict[last_parent].get('superClass') + parent = class_name_dict[last_parent].get("superClass") if parent: - result['parents_path'].append(parent) + result["parents_path"].append(parent) else: check4parent = False results[class_name] = result # get in_out from parents - for parent in result['parents_path']: + for parent in result["parents_path"]: if parent in in_out_dict: - results[class_name]['in_out'] = in_out_dict[parent] + results[class_name]["in_out"] = in_out_dict[parent] break if class_name in in_edge_class_dict: - results[class_name]['in_relations'] = in_edge_class_dict[class_name] + results[class_name]["in_relations"] = in_edge_class_dict[class_name] if class_name in 
out_edge_class_dict: - results[class_name]['out_relations'] = out_edge_class_dict[class_name] + results[class_name]["out_relations"] = out_edge_class_dict[class_name] return results def get_class_info_by_name(): """Return node type by given name.""" results = get_class_infos() - return results.get(request.args.get('name')) + return results.get(request.args.get("name")) def get_class_infos_by_parent_name(childs_of) -> dict: """Get node or edge class information as DOT.""" results = get_class_infos() - return {k: v for k, v in results.items() if childs_of in v['parents_path']} + return {k: v for k, v in results.items() if childs_of in v["parents_path"]} def _get_class_info_as_dot(get_class_method): """Get node or edge class information as DOT.""" classes = get_class_method() graph = Digraph() - graph.graph_attr['rankdir'] = 'LR' - graph.node_attr['shape'] = 'plaintext' + graph.graph_attr["rankdir"] = "LR" + graph.node_attr["shape"] = "plaintext" for node_name, v in classes.items(): - for child in [x['name'] for x in v['children']]: + for child in [x["name"] for x in v["children"]]: graph.edge(node_name, child) return graph.source def get_bel_node_types(): """Return BEL nodes and their metadata.""" - return get_class_infos_by_parent_name('bel') + return get_class_infos_by_parent_name("bel") def get_all_node_types(): """Return BEL nodes and their metadata.""" - bel_node_types = get_class_infos_by_parent_name('bel') - bel_node_types.update(get_class_infos_by_parent_name('ebel')) + bel_node_types = get_class_infos_by_parent_name("bel") + bel_node_types.update(get_class_infos_by_parent_name("ebel")) return bel_node_types @@ -1458,7 +1614,7 @@ def get_bel_node_types_as_dot(): def get_ebel_node_types(): """Return eBEL added nodes and their metadata.""" - return get_class_infos_by_parent_name('ebel') + return get_class_infos_by_parent_name("ebel") def get_ebel_node_types_as_dot(): @@ -1468,7 +1624,7 @@ def get_ebel_node_types_as_dot(): def get_bel_relation_types(): """Return 
BEL edges and their metadata.""" - return get_class_infos_by_parent_name('bel_relation') + return get_class_infos_by_parent_name("bel_relation") def get_bel_relation_types_as_dot(): @@ -1478,13 +1634,14 @@ def get_bel_relation_types_as_dot(): def get_ebel_relation_types(): """Return eBEL added edges and their metadata.""" - return get_class_infos_by_parent_name('ebel_relation') + return get_class_infos_by_parent_name("ebel_relation") def get_ebel_relation_types_as_dot(): """Return eBEL added edges as DOT.""" return _get_class_info_as_dot(get_ebel_relation_types) + # ApiResult = namedtuple('ApiResult', ['results', 'number_of_results', 'page', 'pages','page_size']) @@ -1508,12 +1665,20 @@ def get_documents(): from bel_document""" results = [x.oRecordData for x in Bel().execute(sql)] len_results = len(results) - return {'results': results, 'number_of_results': len_results, 'page': 1, 'pages': 1, 'page_size': len_results} + return { + "results": results, + "number_of_results": len_results, + "page": 1, + "pages": 1, + "page_size": len_results, + } def get_pmids(): """Return all PMIDs and their counts from BEL edges.""" - sql = "Select pmid, count(*) as number_of_edges, citation" \ - " from bel_relation " \ - "where pmid!=0 group by pmid order by number_of_edges desc" + sql = ( + "Select pmid, count(*) as number_of_edges, citation" + " from bel_relation " + "where pmid!=0 group by pmid order by number_of_edges desc" + ) return [x.oRecordData for x in Bel().execute(sql)] diff --git a/ebel/web/api/ebel/v1/bel_against_expression.py b/ebel/web/api/ebel/v1/bel_against_expression.py index 28c002a..7ee4e39 100644 --- a/ebel/web/api/ebel/v1/bel_against_expression.py +++ b/ebel/web/api/ebel/v1/bel_against_expression.py @@ -1,24 +1,31 @@ """Test BEL relations against Expression Atlas.""" -from ebel.manager.rdbms.models.expression_atlas import FoldChange, GroupComparison, Experiment +from collections import namedtuple from math import ceil -from typing import List, Dict, Tuple, 
Set +from typing import Dict, List, Set, Tuple + from flask import request -from collections import namedtuple from sqlalchemy.sql import func from ebel import Bel +from ebel.manager.rdbms.models.expression_atlas import (Experiment, FoldChange, + GroupComparison) from ebel.web.api.ebel.v1 import _get_pagination -Relation = namedtuple('Relation', ['sub', 'sub_bel', 'rel_rid', 'evidence', 'rel', 'pmid', 'obj', 'obj_bel']) -ComparisonGroupValue = namedtuple('CompGroupFoldChange', ['group_comparison_id', 'log2foldchange', 'p_value']) +Relation = namedtuple( + "Relation", + ["sub", "sub_bel", "rel_rid", "evidence", "rel", "pmid", "obj", "obj_bel"], +) +ComparisonGroupValue = namedtuple("CompGroupFoldChange", ["group_comparison_id", "log2foldchange", "p_value"]) -def get_fold_changes(session, - gene_name: str, - l2fc_threshold: float, - pv_threshold: float, - experiment_names: List[str]) -> Tuple[List[ComparisonGroupValue], List[ComparisonGroupValue]]: +def get_fold_changes( + session, + gene_name: str, + l2fc_threshold: float, + pv_threshold: float, + experiment_names: List[str], +) -> Tuple[List[ComparisonGroupValue], List[ComparisonGroupValue]]: """Get fold change values for a given set of passed parameters. Parameters @@ -38,21 +45,23 @@ def get_fold_changes(session, ------- Tuple. 
""" - query = session.query( - FoldChange.group_comparison_id, - func.avg(FoldChange.log2foldchange), - FoldChange.p_value)\ - .filter_by(gene_name=gene_name)\ + query = ( + session.query( + FoldChange.group_comparison_id, + func.avg(FoldChange.log2foldchange), + FoldChange.p_value, + ) + .filter_by(gene_name=gene_name) .group_by(FoldChange.group_comparison_id, FoldChange.gene_id) + ) if experiment_names: query = query.join(GroupComparison).join(Experiment).filter(Experiment.name.in_(experiment_names)) - query_up = query.filter( - FoldChange.log2foldchange >= l2fc_threshold, - FoldChange.p_value <= pv_threshold) + query_up = query.filter(FoldChange.log2foldchange >= l2fc_threshold, FoldChange.p_value <= pv_threshold) query_down = query.filter( FoldChange.log2foldchange <= -1 * l2fc_threshold, - FoldChange.p_value <= pv_threshold) + FoldChange.p_value <= pv_threshold, + ) up_all = [ComparisonGroupValue(*x) for x in query_up.all()] down_all = [ComparisonGroupValue(*x) for x in query_down.all()] @@ -63,19 +72,19 @@ def validate_bel_against_experiment() -> dict: """Check whether BEL edges match experimental results.""" pagination = _get_pagination() - subject_class = request.args.get('subject_class', 'genetic_flow') - subject_name = request.args.get('subject_hgnc_symbol') - object_class = request.args.get('object_class', 'genetic_flow') - object_name = request.args.get('object_hgnc_symbol') - subject_l2fc_threshold = float(request.args.get('subject_log2foldchange_threshold', '2')) - subject_pv_threshold = float(request.args.get('subject_p_value_threshold', '0.05')) - object_l2fc_threshold = float(request.args.get('object_log2foldchange_threshold', '2')) - object_pv_threshold = float(request.args.get('object_p_value_threshold', '0.05')) + subject_class = request.args.get("subject_class", "genetic_flow") + subject_name = request.args.get("subject_hgnc_symbol") + object_class = request.args.get("object_class", "genetic_flow") + object_name = 
request.args.get("object_hgnc_symbol") + subject_l2fc_threshold = float(request.args.get("subject_log2foldchange_threshold", "2")) + subject_pv_threshold = float(request.args.get("subject_p_value_threshold", "0.05")) + object_l2fc_threshold = float(request.args.get("object_log2foldchange_threshold", "2")) + object_pv_threshold = float(request.args.get("object_p_value_threshold", "0.05")) - experiment_names = [x.strip() for x in request.args.get('experiment_names', '').split(',') if x.strip()] + experiment_names = [x.strip() for x in request.args.get("experiment_names", "").split(",") if x.strip()] - subject_sql = f" and name = '{subject_name}'" if subject_name else '' - object_sql = f" and name = '{object_name}'" if object_name else '' + subject_sql = f" and name = '{subject_name}'" if subject_name else "" + object_sql = f" and name = '{object_name}'" if object_name else "" sql_basis = f"""from ( match {{class:{subject_class}, as:sub, @@ -97,7 +106,7 @@ def validate_bel_against_experiment() -> dict: bel = Bel() session = bel.session - number_of_results = bel.execute("Select count(*) " + sql_basis)[0].oRecordData['count'] + number_of_results = bel.execute("Select count(*) " + sql_basis)[0].oRecordData["count"] sql = "Select * " + sql_basis + f" limit {pagination.page_size} skip {pagination.skip}" relations: List[Relation] = [Relation(**x.oRecordData) for x in bel.execute(sql)] @@ -105,22 +114,26 @@ def validate_bel_against_experiment() -> dict: gc_ids = set() for gene_name_sub, gene_name_obj in {(x.sub, x.obj) for x in relations}: - sub_down, sub_up = get_fold_changes(session, - gene_name_sub, - subject_l2fc_threshold, - subject_pv_threshold, - experiment_names) + sub_down, sub_up = get_fold_changes( + session, + gene_name_sub, + subject_l2fc_threshold, + subject_pv_threshold, + experiment_names, + ) sub_down_gc_set = {x.group_comparison_id for x in sub_down} sub_up_gc_set = {x.group_comparison_id for x in sub_up} if gene_name_sub not in dea_results: 
dea_results[gene_name_sub] = {} - obj_down, obj_up = get_fold_changes(session, - gene_name_obj, - object_l2fc_threshold, - object_pv_threshold, - experiment_names) + obj_down, obj_up = get_fold_changes( + session, + gene_name_obj, + object_l2fc_threshold, + object_pv_threshold, + experiment_names, + ) obj_down_gc_set = {x.group_comparison_id for x in obj_down} obj_up_gc_set = {x.group_comparison_id for x in obj_up} @@ -129,29 +142,30 @@ def validate_bel_against_experiment() -> dict: # opposite direction sub_up_obj_down_values = get_values_for_intersection(sub_up, obj_down, sub_up_gc_set, obj_down_gc_set) gc_ids.update(set(sub_up_obj_down_values.keys())) - res['sub_up_obj_down'] = sub_up_obj_down_values + res["sub_up_obj_down"] = sub_up_obj_down_values sub_down_obj_up_values = get_values_for_intersection(sub_down, obj_up, sub_down_gc_set, obj_up_gc_set) gc_ids.update(set(sub_down_obj_up_values.keys())) - res['sub_down_obj_up'] = sub_down_obj_up_values + res["sub_down_obj_up"] = sub_down_obj_up_values # same direction sub_up_obj_up_values = get_values_for_intersection(sub_up, obj_up, sub_up_gc_set, obj_up_gc_set) gc_ids.update(set(sub_up_obj_up_values.keys())) - res['sub_up_obj_up'] = sub_up_obj_up_values + res["sub_up_obj_up"] = sub_up_obj_up_values sub_down_obj_down_values = get_values_for_intersection(sub_down, obj_down, sub_down_gc_set, obj_down_gc_set) gc_ids.update(set(sub_down_obj_down_values.keys())) - res['sub_down_obj_down'] = sub_down_obj_down_values + res["sub_down_obj_down"] = sub_down_obj_down_values - return {'dea': dea_results, - 'group_comparisons': get_group_experiment_by_gc_ids(session, gc_ids), - 'bel_relations': get_bel_relations(relations, dea_results), - 'page': pagination.page, - 'page_size': pagination.page_size, - 'number_of_results': number_of_results, - 'pages': ceil(number_of_results / pagination.page_size) - } + return { + "dea": dea_results, + "group_comparisons": get_group_experiment_by_gc_ids(session, gc_ids), + "bel_relations": 
get_bel_relations(relations, dea_results), + "page": pagination.page, + "page_size": pagination.page_size, + "number_of_results": number_of_results, + "pages": ceil(number_of_results / pagination.page_size), + } def get_bel_relations(relations: List[Relation], dea_results) -> List[dict]: @@ -170,14 +184,18 @@ def get_bel_relations(relations: List[Relation], dea_results) -> List[dict]: for relation in relations: dea = dea_results[relation.sub][relation.obj] rel_dict = relation._asdict() - same_direction = list(dea['sub_up_obj_up'].keys()) + list(dea['sub_down_obj_down'].keys()) - opposite_direction = list(dea['sub_up_obj_down'].keys()) + list(dea['sub_down_obj_up'].keys()) - if relation.rel in ['directly_increases', 'increases', 'positive_correlation']: - rel_dict['supported_by'] = same_direction - rel_dict['in_contradiction_to'] = opposite_direction - elif relation.rel in ['directly_decreases', 'decreases', 'negative_correlation']: - rel_dict['supported_by'] = opposite_direction - rel_dict['in_contradiction_to'] = same_direction + same_direction = list(dea["sub_up_obj_up"].keys()) + list(dea["sub_down_obj_down"].keys()) + opposite_direction = list(dea["sub_up_obj_down"].keys()) + list(dea["sub_down_obj_up"].keys()) + if relation.rel in ["directly_increases", "increases", "positive_correlation"]: + rel_dict["supported_by"] = same_direction + rel_dict["in_contradiction_to"] = opposite_direction + elif relation.rel in [ + "directly_decreases", + "decreases", + "negative_correlation", + ]: + rel_dict["supported_by"] = opposite_direction + rel_dict["in_contradiction_to"] = same_direction results.append(rel_dict) return results @@ -200,19 +218,21 @@ def get_group_experiment_by_gc_ids(session, group_comparison_ids: Set[int]) -> d for group_comparison_id in group_comparison_ids: gc: GroupComparison = session.query(GroupComparison).filter_by(id=group_comparison_id).first() results[group_comparison_id] = { - 'experiment_id': gc.experiment.id, - 'experiment': 
gc.experiment.name, - 'experiment_title': gc.experiment.title, - 'group_comparison': gc.group_comparison, - 'group_comparison_title': gc.name + "experiment_id": gc.experiment.id, + "experiment": gc.experiment.name, + "experiment_title": gc.experiment.title, + "group_comparison": gc.group_comparison, + "group_comparison_title": gc.name, } return results -def get_values_for_intersection(sub: List[ComparisonGroupValue], - obj: List[ComparisonGroupValue], - sub_gc_set: Set[int], - obj_gc_set: Set[int]) -> Dict[int, dict]: +def get_values_for_intersection( + sub: List[ComparisonGroupValue], + obj: List[ComparisonGroupValue], + sub_gc_set: Set[int], + obj_gc_set: Set[int], +) -> Dict[int, dict]: """Get the log2 fold changes and p values for the intersection of a given subject and object. Parameters @@ -227,10 +247,13 @@ def get_values_for_intersection(sub: List[ComparisonGroupValue], Dictionary with group comparison ID as keys and the log2 fold changes + p values as a value dictionary. """ sub_obj_gc_ids = sub_gc_set & obj_gc_set - gc_sub_dict = {x.group_comparison_id: {'l2fc': x.log2foldchange, 'pv': x.p_value} for x in sub} - gc_obj_dict = {x.group_comparison_id: {'l2fc': x.log2foldchange, 'pv': x.p_value} for x in obj} + gc_sub_dict = {x.group_comparison_id: {"l2fc": x.log2foldchange, "pv": x.p_value} for x in sub} + gc_obj_dict = {x.group_comparison_id: {"l2fc": x.log2foldchange, "pv": x.p_value} for x in obj} results = {} for sub_obj_gc_id in sub_obj_gc_ids: - results[sub_obj_gc_id] = {'sub': gc_sub_dict[sub_obj_gc_id], 'obj': gc_obj_dict[sub_obj_gc_id]} + results[sub_obj_gc_id] = { + "sub": gc_sub_dict[sub_obj_gc_id], + "obj": gc_obj_dict[sub_obj_gc_id], + } return results diff --git a/ebel/web/api/ebel/v1/biogrid.py b/ebel/web/api/ebel/v1/biogrid.py index eb78954..ebc6563 100644 --- a/ebel/web/api/ebel/v1/biogrid.py +++ b/ebel/web/api/ebel/v1/biogrid.py @@ -1,26 +1,20 @@ """BioGRID API methods.""" import json - +from collections import namedtuple from math import 
ceil +from typing import Dict, List, Optional + from flask import request -from sqlalchemy import or_, and_ -from collections import namedtuple -from typing import List, Dict, Optional +from sqlalchemy import and_, or_ from ebel import Bel from ebel.manager.orientdb.biodbs.biogrid import MODIFICATIONS -from ebel.manager.rdbms.models.biogrid import ( - Biogrid, - ExperimentalSystem, - Source, - Modification, - Taxonomy, - Interactor, - Publication) +from ebel.manager.rdbms.models.biogrid import (Biogrid, ExperimentalSystem, + Interactor, Modification, + Publication, Source, Taxonomy) from ebel.web.api import RDBMS from ebel.web.api.ebel.v1 import _get_data - SQL_SELECT_HAS_PPI_BG = """select @rid.asString() as rid, in.@rid.asString() as subject_rid, @@ -43,11 +37,11 @@ def get_has_ppi_bg_by_symbol_taxid(): """Get has_ppi_bg edge by POST request with parameter `symbol`.""" - symbol = request.args.get('symbol') + symbol = request.args.get("symbol") if not symbol: - return {'error': "symbol is required"} - namespace = request.args.get('namespace', 'HGNC') - modification = request.args.get('modification') + return {"error": "symbol is required"} + namespace = request.args.get("namespace", "HGNC") + modification = request.args.get("modification") sql = f"""{SQL_SELECT_HAS_PPI_BG}((in.name='{symbol}' and in.namespace='{namespace}') or (out.name='{symbol}' and out.namespace='{namespace}'))""" @@ -58,10 +52,10 @@ def get_has_ppi_bg_by_symbol_taxid(): def get_has_ppi_bg_by_uniprot() -> List[dict]: """Get has_ppi_bg edge by POST request with parameter `uniprot`.""" - uniprot = request.args.get('uniprot') + uniprot = request.args.get("uniprot") if not uniprot: - return [{'error': "uniprot is required"}] - modification = request.args.get('modification') + return [{"error": "uniprot is required"}] + modification = request.args.get("modification") sql = f"""{SQL_SELECT_HAS_PPI_BG}( in.uniprot='{uniprot}' or out.uniprot='{uniprot}')""" if modification and modification in 
MODIFICATIONS: sql += f" and modification = '{modification}'" @@ -77,7 +71,7 @@ def get_biogrid_by_biogrid_id() -> dict: Returns: dict: BioGrid entry. """ - biogrid_id: int = request.args.get('biogrid_id') + biogrid_id: int = request.args.get("biogrid_id") biogrid_entry = RDBMS.get_session().query(Biogrid).filter_by(biogrid_id=biogrid_id).first() return biogrid_entry.as_dict() @@ -134,12 +128,10 @@ def get_biogrid_by_pmid() -> List[dict]: Returns: List[dict]: List[BioGrid entry] """ - pmid = request.args.get('pmid') - publication_id = RDBMS.get_session().query( - Publication.id - ).filter_by( - source='PUBMED', source_identifier=pmid - ).first()[0] + pmid = request.args.get("pmid") + publication_id = ( + RDBMS.get_session().query(Publication.id).filter_by(source="PUBMED", source_identifier=pmid).first()[0] + ) biogrid_entry = RDBMS.get_session().query(Biogrid).filter_by(publication_id=publication_id).all() return [x.as_dict() for x in biogrid_entry] @@ -164,25 +156,39 @@ def get_biogrid() -> dict: List[dict]: List[BioGrid entry] """ # cancel if neither interactor_a or interactor_b are submitted - if not (bool(request.args.get('interactor_a')) or bool(request.args.get('interactor_b'))): - return {'error': "At least interactor_a or interactor_b"} - - Req = namedtuple('RequestObject', ['id_type_a', 'interactor_a', 'taxonomy_id_a', 'id_type_b', 'interactor_b', - 'taxonomy_id_b', 'experimental_system', 'modification', 'source', - 'interaction_directed', 'page_size', 'page']) + if not (bool(request.args.get("interactor_a")) or bool(request.args.get("interactor_b"))): + return {"error": "At least interactor_a or interactor_b"} + + Req = namedtuple( + "RequestObject", + [ + "id_type_a", + "interactor_a", + "taxonomy_id_a", + "id_type_b", + "interactor_b", + "taxonomy_id_b", + "experimental_system", + "modification", + "source", + "interaction_directed", + "page_size", + "page", + ], + ) q = { - 'id_type_a': "symbol", - 'interactor_a': None, - 'taxonomy_id_a': 9606, - 
'id_type_b': "symbol", - 'interactor_b': None, - 'taxonomy_id_b': 9606, - 'experimental_system': None, - 'modification': None, - 'source': None, - 'interaction_directed': False, - 'page_size': 10, - 'page': 1 + "id_type_a": "symbol", + "interactor_a": None, + "taxonomy_id_a": 9606, + "id_type_b": "symbol", + "interactor_b": None, + "taxonomy_id_b": 9606, + "experimental_system": None, + "modification": None, + "source": None, + "interaction_directed": False, + "page_size": 10, + "page": 1, } for key, value in request.args.items(): if key in q: @@ -190,49 +196,61 @@ def get_biogrid() -> dict: req = Req(**q) if req.interactor_a: - a = {req.id_type_a: req.interactor_a, 'taxonomy_id': req.taxonomy_id_a} + a = {req.id_type_a: req.interactor_a, "taxonomy_id": req.taxonomy_id_a} a = {k: v for k, v in a.items() if v} biogrid_a_ids = [x[0] for x in RDBMS.get_session().query(Interactor.biogrid_id).filter_by(**a)] if req.interactor_b: - b = {req.id_type_b: req.interactor_b, 'taxonomy_id': req.taxonomy_id_b} + b = {req.id_type_b: req.interactor_b, "taxonomy_id": req.taxonomy_id_b} b = {k: v for k, v in b.items() if v} biogrid_b_ids = [x[0] for x in RDBMS.get_session().query(Interactor.biogrid_id).filter_by(**b)] if req.interactor_a and req.interactor_b: - interaction_forward = and_(Biogrid.biogrid_a_id.in_(biogrid_a_ids), Biogrid.biogrid_b_id.in_(biogrid_b_ids)) + interaction_forward = and_( + Biogrid.biogrid_a_id.in_(biogrid_a_ids), + Biogrid.biogrid_b_id.in_(biogrid_b_ids), + ) if req.interaction_directed: query_filter = interaction_forward else: - interaction_backward = and_(Biogrid.biogrid_a_id.in_(biogrid_b_ids), - Biogrid.biogrid_b_id.in_(biogrid_a_ids)) + interaction_backward = and_( + Biogrid.biogrid_a_id.in_(biogrid_b_ids), + Biogrid.biogrid_b_id.in_(biogrid_a_ids), + ) query_filter = or_(interaction_forward, interaction_backward) else: if req.interactor_a: - if req.interaction_directed is True or req.interaction_directed == 'true': + if req.interaction_directed is 
True or req.interaction_directed == "true": query_filter = Biogrid.biogrid_a_id.in_(biogrid_a_ids) else: - query_filter = or_(Biogrid.biogrid_a_id.in_(biogrid_a_ids), Biogrid.biogrid_b_id.in_(biogrid_a_ids)) + query_filter = or_( + Biogrid.biogrid_a_id.in_(biogrid_a_ids), + Biogrid.biogrid_b_id.in_(biogrid_a_ids), + ) if req.interactor_b: - if req.interaction_directed is True or req.interaction_directed == 'true': + if req.interaction_directed is True or req.interaction_directed == "true": query_filter = Biogrid.biogrid_b_id.in_(biogrid_b_ids) else: - query_filter = or_(Biogrid.biogrid_a_id.in_(biogrid_b_ids), Biogrid.biogrid_b_id.in_(biogrid_b_ids)) + query_filter = or_( + Biogrid.biogrid_a_id.in_(biogrid_b_ids), + Biogrid.biogrid_b_id.in_(biogrid_b_ids), + ) query = RDBMS.get_session().query(Biogrid).filter(query_filter) if req.experimental_system: - experimental_system_id = RDBMS.get_session().query(ExperimentalSystem.id).filter_by( - experimental_system=req.experimental_system - ).first()[0] + experimental_system_id = ( + RDBMS.get_session() + .query(ExperimentalSystem.id) + .filter_by(experimental_system=req.experimental_system) + .first()[0] + ) query = query.filter_by(experimental_system_id=experimental_system_id) if req.modification: - modification_id = RDBMS.get_session().query( - Modification.id - ).filter_by( - modification=req.modification - ).first()[0] + modification_id = ( + RDBMS.get_session().query(Modification.id).filter_by(modification=req.modification).first()[0] + ) query = query.filter_by(modification_id=modification_id) if req.source: @@ -249,11 +267,11 @@ def get_biogrid() -> dict: pages = ceil(number_of_results / limit) return { - 'page': page, - 'page_size': limit, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': [x.as_dict() for x in query.all()] + "page": page, + "page_size": limit, + "number_of_results": number_of_results, + "pages": pages, + "results": [x.as_dict() for x in query.all()], } @@ -266,26 +284,30 @@ 
def get_interactor_by_symbol(): """Get interactor by gene symbol.""" ra = request.args - taxonomy_id = int(ra.get('taxonomy_id', 9606)) - search_term = ra.get('symbol', "") + taxonomy_id = int(ra.get("taxonomy_id", 9606)) + search_term = ra.get("symbol", "") - query = RDBMS.get_session().query(Interactor).filter( - Interactor.taxonomy_id == taxonomy_id, - Interactor.symbol.like(f'{search_term}%') + query = ( + RDBMS.get_session() + .query(Interactor) + .filter( + Interactor.taxonomy_id == taxonomy_id, + Interactor.symbol.like(f"{search_term}%"), + ) ) number_of_results = query.count() - limit = int(ra.get('page_size', 1)) - page = int(ra.get('page', 10)) + limit = int(ra.get("page_size", 1)) + page = int(ra.get("page", 10)) offset = (page - 1) * limit offset = offset if offset <= 100 else 100 query = query.limit(limit).offset(offset) pages = ceil(number_of_results / limit) return { - 'page': page, - 'page_size': limit, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': [x.as_dict() for x in query.all()] + "page": page, + "page_size": limit, + "number_of_results": number_of_results, + "pages": pages, + "results": [x.as_dict() for x in query.all()], } diff --git a/ebel/web/api/ebel/v1/chebi.py b/ebel/web/api/ebel/v1/chebi.py index 50b0c8c..b186eda 100644 --- a/ebel/web/api/ebel/v1/chebi.py +++ b/ebel/web/api/ebel/v1/chebi.py @@ -1,10 +1,12 @@ """CHEBI API methods.""" from math import ceil + from flask import request from ebel import Bel -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import chebi +from ebel.web.api import RDBMS + from . import _get_pagination @@ -37,10 +39,8 @@ def get_compound_name_by_name_starts_with(name: str): dict Names and CHEBI IDs. 
""" - query = RDBMS.get_session().query( - chebi.Compound.id, chebi.Compound.name - ).filter( - chebi.Compound.name.like(f"{name}%") + query = ( + RDBMS.get_session().query(chebi.Compound.id, chebi.Compound.name).filter(chebi.Compound.name.like(f"{name}%")) ) return {x.name: x.id for x in query.all()} @@ -90,10 +90,10 @@ def get_compound_by_other_db_accession(accession_number: str, db_name: str = Non def get_compound_reference(): """Compile a dictionary of information for a given compound.""" req = dict(request.args.copy()) - page_size = req.pop('page_size', 10) - page = req.pop('page', 1) + page_size = req.pop("page_size", 10) + page = req.pop("page", 1) if not req: - return {'error': "At least one of the parameters have to be filled."} + return {"error": "At least one of the parameters have to be filled."} query = RDBMS.get_session().query(chebi.Reference).filter_by(**req) number_of_results = query.count() @@ -105,18 +105,18 @@ def get_compound_reference(): pages = ceil(number_of_results / limit) return { - 'page': page, - 'page_size': limit, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': [x.as_dict_with_compound_id() for x in query.all()] + "page": page, + "page_size": limit, + "number_of_results": number_of_results, + "pages": pages, + "results": [x.as_dict_with_compound_id() for x in query.all()], } def get_relation(): """Get CHEBI defined relations.""" - if not (bool(request.args.get('final_id') or bool(request.args.get('init_id')))): - return {'error': "At least final_id or init_id have to be filled."} + if not (bool(request.args.get("final_id") or bool(request.args.get("init_id")))): + return {"error": "At least final_id or init_id have to be filled."} rs = RDBMS.get_session().query(chebi.Relation).filter_by(**request.args).all() return [x.as_dict() for x in rs] @@ -126,18 +126,18 @@ def get_bel_chebi_ids(): b = Bel() sql_count = "SELECT count(*) FROM V where chebi IS NOT NULL" p = _get_pagination() - number_of_results = 
b.query_get_dict(sql_count)[0]['count'] + number_of_results = b.query_get_dict(sql_count)[0]["count"] pages = ceil(number_of_results / p.page_size) query = "SELECT @rid.asString(), namespace, name, chebi, bel FROM V where chebi IS NOT NULL" - paras = {k: v for k, v in dict(request.args.copy()).items() if k in ['namespace', 'name', 'chebi']} + paras = {k: v for k, v in dict(request.args.copy()).items() if k in ["namespace", "name", "chebi"]} if paras: - query += " AND " + ' AND '.join([f"{k} like '{v.strip()}'" for k, v in paras.items()]) + query += " AND " + " AND ".join([f"{k} like '{v.strip()}'" for k, v in paras.items()]) print(query) return { - 'page': p.page, - 'page_size': p.page_size, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': b.query_get_dict(f"{query} LIMIT {p.page_size} SKIP {p.skip}") + "page": p.page, + "page_size": p.page_size, + "number_of_results": number_of_results, + "pages": pages, + "results": b.query_get_dict(f"{query} LIMIT {p.page_size} SKIP {p.skip}"), } diff --git a/ebel/web/api/ebel/v1/clinical_trials_gov.py b/ebel/web/api/ebel/v1/clinical_trials_gov.py index eeb56f5..88d91d2 100644 --- a/ebel/web/api/ebel/v1/clinical_trials_gov.py +++ b/ebel/web/api/ebel/v1/clinical_trials_gov.py @@ -3,14 +3,15 @@ from flask import request -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import clinical_trials_gov as ct -from ebel.web.api.ebel.v1 import _get_paginated_query_result, _get_terms_from_model_starts_with +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_paginated_query_result, + _get_terms_from_model_starts_with) def get_ct_by_nct_id(): """Get CT entry based on given NCT ID.""" - nct_id = request.args.get('nct_id') + nct_id = request.args.get("nct_id") c = RDBMS.get_session().query(ct.ClinicalTrialGov).filter_by(nct_id=nct_id).first() if c: return c.as_dict() @@ -31,9 +32,9 @@ def get_ct_by_nct_id(): def get_ct_by_mesh_term(): """Get CT entry based on given mesh term.""" - 
mesh_term = request.args.get('mesh_term', None) + mesh_term = request.args.get("mesh_term", None) if not mesh_term: - return {'error': "Mesh term is required."} + return {"error": "Mesh term is required."} query = RDBMS.get_session().query(ct.MeshTerm).filter(ct.MeshTerm.mesh_term.like(mesh_term)) return _get_paginated_query_result(query) @@ -41,10 +42,10 @@ def get_ct_by_mesh_term(): def get_ct_by_intervention(): """Get CT entry based on given intervention.""" - intervention_name = request.args.get('intervention_name', None) - intervention_type = request.args.get('intervention_type', None) + intervention_name = request.args.get("intervention_name", None) + intervention_type = request.args.get("intervention_type", None) if not (intervention_name or intervention_type): - return {'error': "At least name or type is required."} + return {"error": "At least name or type is required."} query = RDBMS.get_session().query(ct.Intervention) if intervention_type: @@ -57,9 +58,9 @@ def get_ct_by_intervention(): def get_ct_by_keyword(): """Get CT entry based on given keyword.""" - keyword = request.args.get('keyword', None) + keyword = request.args.get("keyword", None) if not keyword: - return {'error': "keyword is required."} + return {"error": "keyword is required."} query = RDBMS.get_session().query(ct.Keyword).filter(ct.Keyword.keyword.like(keyword)) return _get_paginated_query_result(query) @@ -67,9 +68,9 @@ def get_ct_by_keyword(): def get_ct_by_condition(): """Get CT entry based on given condition.""" - condition = request.args.get('condition', None) + condition = request.args.get("condition", None) if not condition: - return {'error': "Condition is required."} + return {"error": "Condition is required."} query = RDBMS.get_session().query(ct.Condition).filter(ct.Condition.condition.like(condition)) return _get_paginated_query_result(query) @@ -77,14 +78,14 @@ def get_ct_by_condition(): def get_mesh_term_starts_with(): """Get CT entry based on beginning of mesh term 
string.""" - return _get_terms_from_model_starts_with(form_field='mesh_term', sa_column=ct.MeshTerm.mesh_term) + return _get_terms_from_model_starts_with(form_field="mesh_term", sa_column=ct.MeshTerm.mesh_term) def get_keyword_starts_with(): """Get CT entry based on beginning of keyword string.""" - return _get_terms_from_model_starts_with(form_field='keyword', sa_column=ct.Keyword.keyword) + return _get_terms_from_model_starts_with(form_field="keyword", sa_column=ct.Keyword.keyword) def get_condition_starts_with(): """Get CT entry based on beginning of condition string.""" - return _get_terms_from_model_starts_with(form_field='condition', sa_column=ct.Condition.condition) + return _get_terms_from_model_starts_with(form_field="condition", sa_column=ct.Condition.condition) diff --git a/ebel/web/api/ebel/v1/clinvar.py b/ebel/web/api/ebel/v1/clinvar.py index 0f7d331..46feeac 100644 --- a/ebel/web/api/ebel/v1/clinvar.py +++ b/ebel/web/api/ebel/v1/clinvar.py @@ -1,13 +1,15 @@ """ClinVar API methods.""" +from math import ceil + from flask import request from ebel import Bel -from math import ceil -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import clinvar -from ebel.web.api.ebel.v1 import _get_data, _get_paginated_query_result, _get_terms_from_model_starts_with, \ - _get_pagination +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_query_result, + _get_pagination, + _get_terms_from_model_starts_with) def get_clinvar(): @@ -19,21 +21,29 @@ def get_clinvar_simple(): """Generic Clinvar result query.""" cc = clinvar.Clinvar cp = clinvar.ClinvarPhenotype - paras = {k: v for k, v in request.args.items() if - k in ['gene_id', 'gene_symbol', 'hgnc_id', 'allele_id', 'assembly', 'rs_db_snp']} - query = RDBMS.get_session().query(cc.id, cc.hgnc_id, cc.allele_id, cc.gene_symbol, cc.assembly, cc.rs_db_snp)\ - .filter_by(**paras).join(clinvar.clinvar__clinvar_phenotype).join(cp) - - phenotype = request.args.get('phenotype') + 
paras = { + k: v + for k, v in request.args.items() + if k in ["gene_id", "gene_symbol", "hgnc_id", "allele_id", "assembly", "rs_db_snp"] + } + query = ( + RDBMS.get_session() + .query(cc.id, cc.hgnc_id, cc.allele_id, cc.gene_symbol, cc.assembly, cc.rs_db_snp) + .filter_by(**paras) + .join(clinvar.clinvar__clinvar_phenotype) + .join(cp) + ) + + phenotype = request.args.get("phenotype") if phenotype: - query = query.filter_by(phenotype='Alzheimer disease') + query = query.filter_by(phenotype="Alzheimer disease") return _get_paginated_query_result(query.distinct(), return_dict=True) def get_phenotype_starts_with(): """Get Clinvar results by fuzzy phenotype search.""" - return _get_terms_from_model_starts_with('phenotype', clinvar.ClinvarPhenotype.phenotype) + return _get_terms_from_model_starts_with("phenotype", clinvar.ClinvarPhenotype.phenotype) def get_by_other_identifier(): @@ -41,16 +51,24 @@ def get_by_other_identifier(): cc = clinvar.Clinvar co = clinvar.ClinvarOtherIdentifier - db = request.args.get('db') - identifier = request.args.get('identifier') - query = RDBMS.get_session().query(co.db, - co.identifier, - cc.id, - cc.hgnc_id, - cc.allele_id, - cc.gene_symbol, - cc.assembly, - cc.rs_db_snp).join(co).filter_by(db=db).filter(co.identifier.like(identifier)) + db = request.args.get("db") + identifier = request.args.get("identifier") + query = ( + RDBMS.get_session() + .query( + co.db, + co.identifier, + cc.id, + cc.hgnc_id, + cc.allele_id, + cc.gene_symbol, + cc.assembly, + cc.rs_db_snp, + ) + .join(co) + .filter_by(db=db) + .filter(co.identifier.like(identifier)) + ) return _get_paginated_query_result(query.distinct(), return_dict=True) @@ -61,14 +79,21 @@ def get_by_medgen(): cm = clinvar.ClinvarPhenotypeMedgen # db = request.args.get('db') - identifier = request.args.get('identifier') - query = RDBMS.get_session().query(cm.identifier, - cc.id, - cc.hgnc_id, - cc.allele_id, - cc.gene_symbol, - cc.assembly, - 
cc.rs_db_snp).join(cm).filter_by(identifier=identifier) + identifier = request.args.get("identifier") + query = ( + RDBMS.get_session() + .query( + cm.identifier, + cc.id, + cc.hgnc_id, + cc.allele_id, + cc.gene_symbol, + cc.assembly, + cc.rs_db_snp, + ) + .join(cm) + .filter_by(identifier=identifier) + ) return _get_paginated_query_result(query.distinct(), return_dict=True) @@ -79,16 +104,18 @@ def get_ebel_relation(): p = _get_pagination() req = dict(request.args.copy()) wheres = [] - if req.get('rs_number'): + if req.get("rs_number"): wheres.append(f"in.rs_number='{req['rs_number']}'") - relation_type = req.get('relation_type') - allowed_relations = ['has_snp_clinvar', - 'has_mapped_snp_cv', - 'has_downstream_snp_cv', - 'has_upstream_snp_cv'] + relation_type = req.get("relation_type") + allowed_relations = [ + "has_snp_clinvar", + "has_mapped_snp_cv", + "has_downstream_snp_cv", + "has_upstream_snp_cv", + ] relation_type = relation_type if relation_type in allowed_relations else allowed_relations[0] - wheres += [f'out.{k} = "{v}"' for k, v in req.items() if k in ['namespace', 'name']] - wheres += [f'{k} = "{v}"' for k, v in req.items() if k in ['phenotype', 'keyword']] + wheres += [f'out.{k} = "{v}"' for k, v in req.items() if k in ["namespace", "name"]] + wheres += [f'{k} = "{v}"' for k, v in req.items() if k in ["phenotype", "keyword"]] columns = """in.rs_number as rs_number, @class.asString() as relation_type, @@ -99,16 +126,16 @@ def get_ebel_relation(): from_where_sql = f""" FROM {relation_type}""" if wheres: - from_where_sql += " WHERE " + ' AND '.join(wheres) + from_where_sql += " WHERE " + " AND ".join(wheres) - number_of_results = b.query_get_dict(f"Select count(*) {from_where_sql}")[0]['count'] + number_of_results = b.query_get_dict(f"Select count(*) {from_where_sql}")[0]["count"] sql = f"SELECT {columns} {from_where_sql} LIMIT {p.page_size} SKIP {p.skip}" return { - 'page': p.page, - 'page_size': p.page_size, - 'number_of_results': number_of_results, - 
'pages': ceil(number_of_results / p.page_size), - 'results': b.query_get_dict(sql) + "page": p.page, + "page_size": p.page_size, + "number_of_results": number_of_results, + "pages": ceil(number_of_results / p.page_size), + "results": b.query_get_dict(sql), } diff --git a/ebel/web/api/ebel/v1/contradictory_edges.py b/ebel/web/api/ebel/v1/contradictory_edges.py index aa232e3..e3da3b0 100644 --- a/ebel/web/api/ebel/v1/contradictory_edges.py +++ b/ebel/web/api/ebel/v1/contradictory_edges.py @@ -1,11 +1,13 @@ """API queries for collecting contraidctory edges.""" -from ebel import Bel -from flask import request -from ebel.web.api.ebel.v1 import _get_pagination from math import ceil from typing import Dict +from flask import request + +from ebel import Bel +from ebel.web.api.ebel.v1 import _get_pagination + def _get_where_sql(where_dict: Dict[str, str]) -> str: where = "" @@ -18,22 +20,23 @@ def get_contradictory_edges(): """Return list of nodes with a given namespace.""" b = Bel() - subject_class = request.args.get('subject_class', 'bel') - object_class = request.args.get('object_class', 'bel') + subject_class = request.args.get("subject_class", "bel") + object_class = request.args.get("object_class", "bel") sub, obj = {}, {} - sub['namespace'] = request.args.get('subject_namespace') - sub['name'] = request.args.get('subject_name') - obj['namespace'] = request.args.get('object_namespace') - obj['name'] = request.args.get('object_name') + sub["namespace"] = request.args.get("subject_namespace") + sub["name"] = request.args.get("subject_name") + obj["namespace"] = request.args.get("object_namespace") + obj["name"] = request.args.get("object_name") pagination = _get_pagination() - match = f"match {{class:{subject_class}, as:subject_1 {_get_where_sql(sub)} }}" \ - ".outE('directly_increases', 'increases'){as:relation_1}" \ - f".inV(){{class:{object_class}, as:object {_get_where_sql(obj)} }}" \ - ".inE('decreases', 'directly_decreases'){as:relation_2}" \ - 
f".outV(){{class:{subject_class}, as:subject_2, where:($matched.subject_1=$currentMatch)}}" \ - """ return + match = ( + f"match {{class:{subject_class}, as:subject_1 {_get_where_sql(sub)} }}" + ".outE('directly_increases', 'increases'){as:relation_1}" + f".inV(){{class:{object_class}, as:object {_get_where_sql(obj)} }}" + ".inE('decreases', 'directly_decreases'){as:relation_2}" + f".outV(){{class:{subject_class}, as:subject_2, where:($matched.subject_1=$currentMatch)}}" + """ return subject_1.@rid.asString() as subject_rid, subject_1.@class.asString() as subject_class, subject_1.namespace as subject_namespace, @@ -54,17 +57,18 @@ def get_contradictory_edges(): relation_2.evidence as relation_2_evidence, relation_2.pmid as relation_2_pmid, relation_2.annotation as relation_2_annotation""" + ) - number_of_results = b.execute(f"Select count(*) as number from ({match})")[0].oRecordData['number'] + number_of_results = b.execute(f"Select count(*) as number from ({match})")[0].oRecordData["number"] sql_pagination = f"Select * from ({match}) limit {pagination.page_size} skip {pagination.skip}" results = [x.oRecordData for x in b.execute(sql_pagination)] pages = ceil(number_of_results / pagination.page_size) return { - 'page': pagination.page, - 'page_size': pagination.page_size, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': results + "page": pagination.page, + "page_size": pagination.page_size, + "number_of_results": number_of_results, + "pages": pages, + "results": results, } diff --git a/ebel/web/api/ebel/v1/disgenet.py b/ebel/web/api/ebel/v1/disgenet.py index e6ebad9..71dbab4 100644 --- a/ebel/web/api/ebel/v1/disgenet.py +++ b/ebel/web/api/ebel/v1/disgenet.py @@ -1,10 +1,11 @@ """DisGeNet API methods.""" from flask import request -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import disgenet -from ebel.web.api.ebel.v1 import _get_paginated_query_result, _get_terms_from_model_starts_with, \ - _get_paginated_ebel_query_result 
+from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_paginated_ebel_query_result, + _get_paginated_query_result, + _get_terms_from_model_starts_with) def get_sources(): @@ -16,31 +17,31 @@ def get_sources(): def get_disease_name_starts_with(): """Get entry based on beginning of disease name.""" return _get_terms_from_model_starts_with( - form_field='disease_name', - sa_column=disgenet.DisgenetDisease.disease_name) + form_field="disease_name", sa_column=disgenet.DisgenetDisease.disease_name + ) def get_gene_symbol_starts_with(): """Get entry based on beginning of symbol.""" return _get_terms_from_model_starts_with( - form_field='gene_symbol', - sa_column=disgenet.DisgenetGeneSymbol.gene_symbol) + form_field="gene_symbol", sa_column=disgenet.DisgenetGeneSymbol.gene_symbol + ) def get_gene_disease_pmid_associations(): """Get list of PMIDs with disease genes defined.""" - gene_cols = ['gene_id', 'disease_id', 'pmid'] + gene_cols = ["gene_id", "disease_id", "pmid"] gene_paras = {k: v for k, v in request.args.items() if k in gene_cols and v} query = RDBMS.get_session().query(disgenet.DisgenetGene).filter_by(**gene_paras) - gene_symbol = request.args.get('gene_symbol') + gene_symbol = request.args.get("gene_symbol") if gene_symbol: gs = disgenet.DisgenetGeneSymbol query = query.join(gs).filter(gs.gene_symbol.like(gene_symbol)) - disease_name = request.args.get('disease_name') + disease_name = request.args.get("disease_name") if disease_name: dd = disgenet.DisgenetDisease query = query.join(dd).filter(dd.disease_name.like(disease_name)) - source = request.args.get('source') + source = request.args.get("source") if source: so = disgenet.DisgenetSource query = query.join(so).filter(so.source.like(source)) @@ -50,14 +51,22 @@ def get_gene_disease_pmid_associations(): def get_variant_disease_pmid_associations(): """Get list of PMIDs with disease variants defined.""" - cols = ['snp_id', 'chromosome', 'position', 'disease_id', 'score', 'pmid', 'source_id'] + 
cols = [ + "snp_id", + "chromosome", + "position", + "disease_id", + "score", + "pmid", + "source_id", + ] paras = {k: v for k, v in request.args.items() if k in cols and v} query = RDBMS.get_session().query(disgenet.DisgenetVariant).filter_by(**paras) - disease_name = request.args.get('disease_name') + disease_name = request.args.get("disease_name") if disease_name: dd = disgenet.DisgenetDisease query = query.join(dd).filter(dd.disease_name.like(disease_name)) - source = request.args.get('source') + source = request.args.get("source") if source: so = disgenet.DisgenetSource query = query.join(so).filter(so.source.like(source)) @@ -67,21 +76,25 @@ def get_variant_disease_pmid_associations(): def get_ebel_has_snp_disgenet(): """Get SNP edge add by e(BE:L) via DiGeNet.""" - edge_class = 'has_snp_disgenet' - - allowed_relations = ['has_snp_disgenet', - 'has_mapped_snp_dgn', - 'has_downstream_snp_dgn', - 'has_upstream_snp_dgn'] - relation = request.args.get('relation') - - where = {'@class': relation if relation in allowed_relations else None, - 'out.name': request.args.get('name'), - 'out.namespace': request.args.get('namespace'), - 'in.rs_number': request.args.get('rs_number')} + edge_class = "has_snp_disgenet" + + allowed_relations = [ + "has_snp_disgenet", + "has_mapped_snp_dgn", + "has_downstream_snp_dgn", + "has_upstream_snp_dgn", + ] + relation = request.args.get("relation") + + where = { + "@class": relation if relation in allowed_relations else None, + "out.name": request.args.get("name"), + "out.namespace": request.args.get("namespace"), + "in.rs_number": request.args.get("rs_number"), + } # non-edge parameter # edge parameter - edge_properties = ['disease_name', 'source', 'pmid'] + edge_properties = ["disease_name", "source", "pmid"] where.update({k: v for k, v in request.args.items() if k in edge_properties}) sql = f"""SELECT diff --git a/ebel/web/api/ebel/v1/drug2pmod.py b/ebel/web/api/ebel/v1/drug2pmod.py index f99942f..837cf19 100644 --- 
a/ebel/web/api/ebel/v1/drug2pmod.py +++ b/ebel/web/api/ebel/v1/drug2pmod.py @@ -1,14 +1,14 @@ """Custom drug2pmod API methods.""" import re -from collections import namedtuple, defaultdict +from collections import defaultdict, namedtuple from math import ceil from typing import List, Union from flask import request from ebel import Bel -from ebel.web.api.ebel.v1 import OrientDbSqlOperator, DataType +from ebel.web.api.ebel.v1 import DataType, OrientDbSqlOperator SQL_MATCH_TEMPLATE = """ match {{class:drug, as:drug{drug}}} @@ -22,12 +22,14 @@ class Column: """Column class.""" - def __init__(self, - odb_class: str, - form_name: str, - column: str, - sql_operator: OrientDbSqlOperator = OrientDbSqlOperator.EQUALS, - data_type: DataType = DataType.STRING): + def __init__( + self, + odb_class: str, + form_name: str, + column: str, + sql_operator: OrientDbSqlOperator = OrientDbSqlOperator.EQUALS, + data_type: DataType = DataType.STRING, + ): """Init method.""" self.odb_class = odb_class self.column = column @@ -41,11 +43,11 @@ def set_search_term(self, value: Union[str, None]): """Get the value of a given search term.""" if value is not None and value.strip(): self.value = value.strip() - if '%' not in self.value and self.sql_operator == OrientDbSqlOperator.LIKE: + if "%" not in self.value and self.sql_operator == OrientDbSqlOperator.LIKE: self.sql_operator = OrientDbSqlOperator.EQUALS -Pagination = namedtuple('Pagination', ['page', 'page_size', 'skip']) +Pagination = namedtuple("Pagination", ["page", "page_size", "skip"]) class Query: @@ -59,10 +61,10 @@ def __init__(self, columns: List[Column]): def get_pagination(self) -> Pagination: """Return the results as a collection of paginations.""" - page_size = request.args.get('page_size', '10') + page_size = request.args.get("page_size", "10") page_size = int(page_size) if re.search(r"^\d+$", page_size) else 10 page_size = 10 if page_size >= 100 else page_size - page = request.args.get('page', '1') + page = 
request.args.get("page", "1") page = int(page) if re.search(r"^\d+$", page) else 1 skip = (page - 1) * page_size return Pagination(page=page, page_size=page_size, skip=skip) @@ -73,9 +75,9 @@ def get_wheres_dict(self) -> dict: for col in self.columns: if col.value: - if col.column.endswith('@rid') and "," in col.value: + if col.column.endswith("@rid") and "," in col.value: rids = [x.strip() for x in col.value.split(",") if re.search(r"^#\d+:\d+$", x.strip())] - rids_str = "[" + ','.join(rids) + "]" + rids_str = "[" + ",".join(rids) + "]" wheres[col.odb_class].append(f"{col.column} in {rids_str}") elif col.column != "@class": value = col.value.replace('"', '\\"') @@ -85,15 +87,19 @@ def get_wheres_dict(self) -> dict: else: value = value - if col.data_type in [DataType.LIST_STRING, DataType.LIST_NUMBER, DataType.LIST_INTEGER]: - wheres[col.odb_class].append(f'{value} {col.sql_operator.value} {col.column}') + if col.data_type in [ + DataType.LIST_STRING, + DataType.LIST_NUMBER, + DataType.LIST_INTEGER, + ]: + wheres[col.odb_class].append(f"{value} {col.sql_operator.value} {col.column}") else: - wheres[col.odb_class].append(f'{col.column} {col.sql_operator.value} {value}') + wheres[col.odb_class].append(f"{col.column} {col.sql_operator.value} {value}") - wheres_dict = {odb_class: '' for odb_class in {c.odb_class for c in self.columns}} + wheres_dict = {odb_class: "" for odb_class in {c.odb_class for c in self.columns}} for odb_class, col_queries in wheres.items(): if col_queries: - wheres_dict[odb_class] = ", where:( " + ' AND '.join(col_queries) + " )" + wheres_dict[odb_class] = ", where:( " + " AND ".join(col_queries) + " )" return wheres_dict @@ -105,14 +111,14 @@ def sql(self) -> str: def get_number_of_results(self) -> int: """Count the number of results.""" - nodes_edges = ', '.join({c.odb_class for c in self.columns}) + nodes_edges = ", ".join({c.odb_class for c in self.columns}) sql = f"SELECT count(*) FROM ({self.sql} return {nodes_edges})" print(sql) - return 
self.ebel.query_get_dict(sql)[0]['count'] + return self.ebel.query_get_dict(sql)[0]["count"] @property def __sql4results(self): - cols = ', '.join([f"{c.odb_class}.{c.display_column} as {c.form_name}" for c in self.columns]) + cols = ", ".join([f"{c.odb_class}.{c.display_column} as {c.form_name}" for c in self.columns]) return self.sql + " return " + cols def get_result(self) -> dict: @@ -128,50 +134,100 @@ def get_result(self) -> dict: results = [x for x in self.ebel.query_get_dict(sql_paginated)] return { - 'page': p.page, - 'page_size': p.page_size, - 'number_of_results': number_of_results, - 'pages': pages, - 'results': results} + "page": p.page, + "page_size": p.page_size, + "number_of_results": number_of_results, + "pages": pages, + "results": results, + } def get_drug2pmod() -> dict: """Create a table of relations and information fulfilling the algorithm.""" columns: List[Column] = [ - Column('drug', 'drug__cas_number', 'cas_number', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__description', 'description', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__drugbank_id', 'drugbank_id', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__indication', 'indication', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__label', 'label', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__mechanism_of_action', 'mechanism_of_action', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__metabolism', 'metabolism', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__pharmacodynamics', 'pharmacodynamics', OrientDbSqlOperator.LIKE), - Column('drug', 'drug__toxicity', 'toxicity', OrientDbSqlOperator.LIKE), - Column('has_drug_target', 'has_drug_target__action', 'action', OrientDbSqlOperator.LIKE), - Column('has_drug_target', 'has_drug_target__known_action', 'known_action', OrientDbSqlOperator.LIKE), - Column('drug_target', 'drug_target__name', 'name', OrientDbSqlOperator.LIKE), - Column('drug_target', 'drug_target__bel', 'bel', OrientDbSqlOperator.LIKE), - 
Column('drug_target', 'drug_target__label', 'label', OrientDbSqlOperator.LIKE), - Column('drug_target', 'drug_target__uniprot', 'uniprot', OrientDbSqlOperator.LIKE), - Column('drug_target', 'drug_target__reactome_pathways', 'reactome_pathways', OrientDbSqlOperator.IN, - DataType.LIST_STRING), - Column('drug_target_to_target', 'drug_target_to_target__relation', '@class', OrientDbSqlOperator.EQUALS), - Column('drug_target_to_target', 'drug_target_to_target__evidence', 'evidence', OrientDbSqlOperator.LIKE), - Column('drug_target_to_target', 'drug_target_to_target__pmid', 'pmid', OrientDbSqlOperator.LIKE), - Column('drug_target_to_target', 'drug_target_to_target__citation', 'citation', OrientDbSqlOperator.LIKE), - Column('target', 'target__name', 'name', OrientDbSqlOperator.LIKE), - Column('target', 'target__bel', 'bel', OrientDbSqlOperator.LIKE), - Column('target', 'target__label', 'label', OrientDbSqlOperator.LIKE), - Column('target', 'target__uniprot', 'uniprot', OrientDbSqlOperator.LIKE), + Column("drug", "drug__cas_number", "cas_number", OrientDbSqlOperator.LIKE), + Column("drug", "drug__description", "description", OrientDbSqlOperator.LIKE), + Column("drug", "drug__drugbank_id", "drugbank_id", OrientDbSqlOperator.LIKE), + Column("drug", "drug__indication", "indication", OrientDbSqlOperator.LIKE), + Column("drug", "drug__label", "label", OrientDbSqlOperator.LIKE), Column( - 'target', 'target__reactome_pathways', 'reactome_pathways', OrientDbSqlOperator.IN, DataType.LIST_STRING + "drug", + "drug__mechanism_of_action", + "mechanism_of_action", + OrientDbSqlOperator.LIKE, ), - Column('pmod', 'pmod__amino_acid', 'amino_acid', OrientDbSqlOperator.LIKE), - Column('pmod', 'pmod__name', 'name', OrientDbSqlOperator.LIKE), - Column('pmod', 'pmod__namespace', 'namespace', OrientDbSqlOperator.LIKE), - Column('pmod', 'pmod__position', 'position', OrientDbSqlOperator.LIKE), - Column('pmod', 'pmod__type', 'type', OrientDbSqlOperator.LIKE), - Column('pmod', 'pmod__bel', 'bel', 
OrientDbSqlOperator.LIKE) + Column("drug", "drug__metabolism", "metabolism", OrientDbSqlOperator.LIKE), + Column( + "drug", + "drug__pharmacodynamics", + "pharmacodynamics", + OrientDbSqlOperator.LIKE, + ), + Column("drug", "drug__toxicity", "toxicity", OrientDbSqlOperator.LIKE), + Column( + "has_drug_target", + "has_drug_target__action", + "action", + OrientDbSqlOperator.LIKE, + ), + Column( + "has_drug_target", + "has_drug_target__known_action", + "known_action", + OrientDbSqlOperator.LIKE, + ), + Column("drug_target", "drug_target__name", "name", OrientDbSqlOperator.LIKE), + Column("drug_target", "drug_target__bel", "bel", OrientDbSqlOperator.LIKE), + Column("drug_target", "drug_target__label", "label", OrientDbSqlOperator.LIKE), + Column("drug_target", "drug_target__uniprot", "uniprot", OrientDbSqlOperator.LIKE), + Column( + "drug_target", + "drug_target__reactome_pathways", + "reactome_pathways", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column( + "drug_target_to_target", + "drug_target_to_target__relation", + "@class", + OrientDbSqlOperator.EQUALS, + ), + Column( + "drug_target_to_target", + "drug_target_to_target__evidence", + "evidence", + OrientDbSqlOperator.LIKE, + ), + Column( + "drug_target_to_target", + "drug_target_to_target__pmid", + "pmid", + OrientDbSqlOperator.LIKE, + ), + Column( + "drug_target_to_target", + "drug_target_to_target__citation", + "citation", + OrientDbSqlOperator.LIKE, + ), + Column("target", "target__name", "name", OrientDbSqlOperator.LIKE), + Column("target", "target__bel", "bel", OrientDbSqlOperator.LIKE), + Column("target", "target__label", "label", OrientDbSqlOperator.LIKE), + Column("target", "target__uniprot", "uniprot", OrientDbSqlOperator.LIKE), + Column( + "target", + "target__reactome_pathways", + "reactome_pathways", + OrientDbSqlOperator.IN, + DataType.LIST_STRING, + ), + Column("pmod", "pmod__amino_acid", "amino_acid", OrientDbSqlOperator.LIKE), + Column("pmod", "pmod__name", "name", 
OrientDbSqlOperator.LIKE), + Column("pmod", "pmod__namespace", "namespace", OrientDbSqlOperator.LIKE), + Column("pmod", "pmod__position", "position", OrientDbSqlOperator.LIKE), + Column("pmod", "pmod__type", "type", OrientDbSqlOperator.LIKE), + Column("pmod", "pmod__bel", "bel", OrientDbSqlOperator.LIKE), ] for column in columns: diff --git a/ebel/web/api/ebel/v1/drugbank.py b/ebel/web/api/ebel/v1/drugbank.py index 1baa5ae..6b55fa9 100644 --- a/ebel/web/api/ebel/v1/drugbank.py +++ b/ebel/web/api/ebel/v1/drugbank.py @@ -2,14 +2,15 @@ from flask import request from sqlalchemy.orm.attributes import InstrumentedAttribute -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import drugbank -from ebel.web.api.ebel.v1 import _get_data, _get_paginated_query_result, _get_paginated_ebel_query_result +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, + _get_paginated_query_result) def get_by_id(): """Get DrugBank entry by ID.""" - drugbank_id = request.args.get('drugbank_id') + drugbank_id = request.args.get("drugbank_id") if drugbank_id: query = RDBMS.get_session().query(drugbank.Drugbank).filter_by(drugbank_id=drugbank_id.strip()) return query.first().as_dict() @@ -27,10 +28,11 @@ def get_interaction(): def _get_model(model): """Get tables connected to Drugbank - paginated.""" - drugbank_id = request.args.get('drugbank_id') + drugbank_id = request.args.get("drugbank_id") columns_like = { - col_obj.like(request.args[col_name]) for col_name, col_obj in model.__dict__.items() - if isinstance(col_obj, InstrumentedAttribute) and col_name in request.args and col_name != 'drugbank_id' + col_obj.like(request.args[col_name]) + for col_name, col_obj in model.__dict__.items() + if isinstance(col_obj, InstrumentedAttribute) and col_name in request.args and col_name != "drugbank_id" } query = RDBMS.get_session().query(model).filter(*columns_like) if drugbank_id: @@ -81,22 +83,22 @@ def get_synonym(): def 
get_has_drug_target_db(): """Get DrugBank related eBEL relations.""" conf = { - 'target_rid': "in.@rid.asString()", - 'target_name': "in.name", - 'target_namespace': "in.namespace", - 'target_uniprot_accession': "in.uniprot", - 'drugbank_id': "out.drugbank_id", - 'drug_name': "out.label", - 'drug_rid': "out.@rid.asString()", - 'edge_rid': "@rid.asString()", - 'action': "action", - 'known_action': "known_action" + "target_rid": "in.@rid.asString()", + "target_name": "in.name", + "target_namespace": "in.namespace", + "target_uniprot_accession": "in.uniprot", + "drugbank_id": "out.drugbank_id", + "drug_name": "out.label", + "drug_rid": "out.@rid.asString()", + "edge_rid": "@rid.asString()", + "action": "action", + "known_action": "known_action", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_drug_target_db" ra = request.args paras = {k: ra[k] for k in ra if k in conf} if paras: - sql += " WHERE " + ' AND '.join([f'{conf[k]} = "{v}"' for k, v in paras.items()]) + sql += " WHERE " + " AND ".join([f'{conf[k]} = "{v}"' for k, v in paras.items()]) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/ebel.py b/ebel/web/api/ebel/v1/ebel.py index 2ae00d5..cf7366a 100644 --- a/ebel/web/api/ebel/v1/ebel.py +++ b/ebel/web/api/ebel/v1/ebel.py @@ -1,46 +1,55 @@ """Generic e(BE:L) API methods.""" import json +from collections import Counter + import numpy as np import pandas as pd - from flask import request -from collections import Counter from ebel import Bel -from ebel.web.api import RDBMS from ebel.manager.rdbms.models.uniprot import Uniprot +from ebel.web.api import RDBMS def get_bel_edge_statistics_by_uniprot_accession(): """Get edge statatistics by UniProt accession number. 
'has__' edges are excluded.""" query_object = json.loads(request.data) - acc = query_object['uniprot_accession'] - eclass = query_object['edge_class'] + acc = query_object["uniprot_accession"] + eclass = query_object["edge_class"] results = {} - for direction in ('out', 'in'): + for direction in ("out", "in"): results[direction] = __get_bel_edge_statistics_by_uniprot_accession(acc, eclass, direction) return results def __get_bel_edge_statistics_by_uniprot_accession(acc, eclass, direction): - sql_direction = {'out': ('out', 'in'), 'in': ('in', 'out')} - - sql = "match {class:protein, where:(uniprot='%s')}.%sE()" % (acc, sql_direction[direction][0]) - sql += "{class:%s, as:e,where:(not @class like 'has__%%')}.%sV(){as:o}" % (eclass, sql_direction[direction][1]) - sql += " return e.@class.asString() as eclass, e.@rid.asString() as erid, o.@class as oclass, " \ - "o.@rid.asString() as orid" + sql_direction = {"out": ("out", "in"), "in": ("in", "out")} + + sql = "match {class:protein, where:(uniprot='%s')}.%sE()" % ( + acc, + sql_direction[direction][0], + ) + sql += "{class:%s, as:e,where:(not @class like 'has__%%')}.%sV(){as:o}" % ( + eclass, + sql_direction[direction][1], + ) + sql += ( + " return e.@class.asString() as eclass, e.@rid.asString() as erid, o.@class as oclass, " + "o.@rid.asString() as orid" + ) rows = [x.oRecordData for x in Bel().client.command(sql)] - edge_counter = Counter([x['eclass'] for x in rows]) + edge_counter = Counter([x["eclass"] for x in rows]) edge_counter_sorted = sorted(edge_counter.items(), key=lambda item: item[1], reverse=True) - edge_counter_ordered = [{'name': x[0], 'value': x[1]} for x in edge_counter_sorted] + edge_counter_ordered = [{"name": x[0], "value": x[1]} for x in edge_counter_sorted] # make object unique - object_rid_class_dict = {x['orid']: x['oclass'] for x in rows} + object_rid_class_dict = {x["orid"]: x["oclass"] for x in rows} counter = Counter(object_rid_class_dict.values()) - object_counter_ordered = [{'name': 
x[0], 'value': x[1]} for x in - sorted(counter.items(), key=lambda i: i[1], reverse=True)] - return {'edges': edge_counter_ordered, 'objects': object_counter_ordered} + object_counter_ordered = [ + {"name": x[0], "value": x[1]} for x in sorted(counter.items(), key=lambda i: i[1], reverse=True) + ] + return {"edges": edge_counter_ordered, "objects": object_counter_ordered} def _describe(s: pd.Series): @@ -50,9 +59,16 @@ def _describe(s: pd.Series): def get_intact_by_uniprot(): """Get IntAct entries by UniProt ID.""" query_object = json.loads(request.data) - columns = ["name", "recommended_name", "accession", "detection_method", "interaction_type", "confidence_value", - "pmid"] - acc = query_object['uniprot_accession'] + columns = [ + "name", + "recommended_name", + "accession", + "detection_method", + "interaction_type", + "confidence_value", + "pmid", + ] + acc = query_object["uniprot_accession"] sql = f"""(Select u.name, u.recommended_name, i.int_b_uniprot_id, i.detection_method, i.interaction_type, i.confidence_value, CAST(i.pmid AS CHAR) from intact i inner join uniprot u on (u.accession=i.int_b_uniprot_id) where @@ -65,22 +81,26 @@ def get_intact_by_uniprot(): # statistics_name_10 = Counter([x['name'] for x in rows]).most_common(10) r = { - 'rows': df.to_dict('records'), - 'total': df.shape[0], - 'statistics': { - 'confidence_value': _describe(df.confidence_value), - 'name': _describe(df.name), - 'pmid': _describe(df.pmid), - 'detection_method': _describe(df.detection_method), - 'interaction_type': _describe(df.interaction_type), - } + "rows": df.to_dict("records"), + "total": df.shape[0], + "statistics": { + "confidence_value": _describe(df.confidence_value), + "name": _describe(df.name), + "pmid": _describe(df.pmid), + "detection_method": _describe(df.detection_method), + "interaction_type": _describe(df.interaction_type), + }, } return r def find_all(): """Get all UniProt entries.""" - term = json.loads(request.data)['term'].strip() - uniprot_entries = 
RDBMS.get_session().query(Uniprot.recommended_name).filter( - Uniprot.recommended_name.like("%" + term + "%")).count() + term = json.loads(request.data)["term"].strip() + uniprot_entries = ( + RDBMS.get_session() + .query(Uniprot.recommended_name) + .filter(Uniprot.recommended_name.like("%" + term + "%")) + .count() + ) return uniprot_entries diff --git a/ebel/web/api/ebel/v1/ensembl.py b/ebel/web/api/ebel/v1/ensembl.py index 9629fa1..5420fc0 100644 --- a/ebel/web/api/ebel/v1/ensembl.py +++ b/ebel/web/api/ebel/v1/ensembl.py @@ -1,6 +1,6 @@ """EnsEMBL API methods.""" -from ebel.web.api.ebel.v1 import _get_data from ebel.manager.rdbms.models import ensembl +from ebel.web.api.ebel.v1 import _get_data def get_ensembl(): diff --git a/ebel/web/api/ebel/v1/expression_atlas.py b/ebel/web/api/ebel/v1/expression_atlas.py index cdb0ea4..9302eaa 100644 --- a/ebel/web/api/ebel/v1/expression_atlas.py +++ b/ebel/web/api/ebel/v1/expression_atlas.py @@ -1,31 +1,28 @@ """Expression Atlas API methods.""" +import json import math import re -import json -from ebel.web.api import RDBMS -from . import _get_paginated_query_result, add_query_filters +from collections import Counter -from ebel.web.api.ebel.v1 import _get_data from flask import request from sqlalchemy import inspect -from collections import Counter -from ebel.manager.rdbms.models.expression_atlas import ( - GroupComparison, - Gsea, - FoldChange, - Experiment, - Idf, - SdrfCondensed) from ebel import Bel +from ebel.manager.rdbms.models.expression_atlas import (Experiment, FoldChange, + GroupComparison, Gsea, + Idf, SdrfCondensed) +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import _get_data + +from . 
import _get_paginated_query_result, add_query_filters models_dict = { - 'experiment': Experiment, - 'idf': Idf, - 'gsea': Gsea, - 'group_comparison': GroupComparison, - 'foldchange': FoldChange, - 'sdrf_condensed': SdrfCondensed + "experiment": Experiment, + "idf": Idf, + "gsea": Gsea, + "group_comparison": GroupComparison, + "foldchange": FoldChange, + "sdrf_condensed": SdrfCondensed, } @@ -62,59 +59,60 @@ def get_gsea(): class GseaType: """String constant definitions.""" - GO = 'go' - REACTOME = 'reactome' - INTERPRO = 'interpro' + GO = "go" + REACTOME = "reactome" + INTERPRO = "interpro" def get_most_common_gseas_by_group_comparison_ids() -> list: """Identify most often ocurring GSEAs based on group comparison IDs.""" b = Bel() - group_comparison_ids_str = request.args.get('group_comparison_ids', '') + group_comparison_ids_str = request.args.get("group_comparison_ids", "") group_comparison_ids = [ int(x.strip()) - for x in group_comparison_ids_str.split(',') + for x in group_comparison_ids_str.split(",") if isinstance(x, str) and re.search(r"\s*\d+\s*", x) ] - query = b.session.query( - Gsea.accession, - Gsea.term, - Gsea.gsea_type - ).filter(Gsea.group_comparison_id.in_(group_comparison_ids)) + query = b.session.query(Gsea.accession, Gsea.term, Gsea.gsea_type).filter( + Gsea.group_comparison_id.in_(group_comparison_ids) + ) - gsea_type = request.args.get('gsea_type', '') - gsea_type = gsea_type if gsea_type in [GseaType.GO, GseaType.REACTOME, GseaType.INTERPRO] else '' + gsea_type = request.args.get("gsea_type", "") + gsea_type = gsea_type if gsea_type in [GseaType.GO, GseaType.REACTOME, GseaType.INTERPRO] else "" if gsea_type: query = query.filter(Gsea.gsea_type == gsea_type) - p_adj_non_dir = request.args.get('p_adj_non_dir', '') - p_adj_non_dir = float(p_adj_non_dir) if re.search(r'\s*\d+(\.\d+)?\s*', p_adj_non_dir) else None + p_adj_non_dir = request.args.get("p_adj_non_dir", "") + p_adj_non_dir = float(p_adj_non_dir) if re.search(r"\s*\d+(\.\d+)?\s*", 
p_adj_non_dir) else None if isinstance(p_adj_non_dir, float): query = query.filter(Gsea.p_adj_non_dir <= p_adj_non_dir) results = [x for x in query.all()] - return [{'accession': x[0][0], 'term': x[0][1], 'type': x[0][2], 'occurence': x[1]} - for x in Counter(results).most_common() if x[1] > 1] + return [ + {"accession": x[0][0], "term": x[0][1], "type": x[0][2], "occurence": x[1]} + for x in Counter(results).most_common() + if x[1] > 1 + ] def get_comparison_groups_by_genes(): """Get experiments, comparion groups and foldchanges by 2 genes.""" - gene_1 = request.args.get('gene_symbol_1') - gene_2 = request.args.get('gene_symbol_2') + gene_1 = request.args.get("gene_symbol_1") + gene_2 = request.args.get("gene_symbol_2") return _get_comparison_groups_by_genes(gene_1, gene_2) def _get_comparison_groups_by_genes(gene_1, gene_2): """Get experiments, comparion groups and foldchanges by 2 genes.""" result_keys = [ - 'fold_change_gene_symbol_1', - 'fold_change_gene_symbol_2', - 'group_comparison', - 'experiment_title', - 'experiment_identifier', - 'is_positive_correlated' + "fold_change_gene_symbol_1", + "fold_change_gene_symbol_2", + "group_comparison", + "experiment_title", + "experiment_identifier", + "is_positive_correlated", ] sql = f"""Select a.log2foldchange, @@ -142,8 +140,9 @@ def _get_comparison_groups_by_genes(gene_1, gene_2): def get_comparison_groups_by_edge_rid(): """Use edge rIDs to get different groups to compare.""" - rid = request.args.get('edge_rid') - res = Bel().query_get_dict(f"""Select + rid = request.args.get("edge_rid") + res = Bel().query_get_dict( + f"""Select in.name as name_in, in.namespace as ns_in, out.name as name_out, @@ -154,26 +153,27 @@ def get_comparison_groups_by_edge_rid(): citation, annotation, evidence - from {rid}""") + from {rid}""" + ) if res: - has_all_cols = all([(x in res[0]) for x in ['name_in', 'ns_in', 'name_out', 'ns_out']]) + has_all_cols = all([(x in res[0]) for x in ["name_in", "ns_in", "name_out", "ns_out"]]) if 
has_all_cols: - both_ns_hgnc = res[0]['ns_in'] == 'HGNC' and res[0]['ns_out'] == 'HGNC' - not_the_same = res[0]['name_in'] != res[0]['name_out'] + both_ns_hgnc = res[0]["ns_in"] == "HGNC" and res[0]["ns_out"] == "HGNC" + not_the_same = res[0]["name_in"] != res[0]["name_out"] if has_all_cols and both_ns_hgnc and not_the_same: - gene_symbol_1 = res[0]['name_in'] - gene_symbol_2 = res[0]['name_out'] + gene_symbol_1 = res[0]["name_in"] + gene_symbol_2 = res[0]["name_out"] comparisons = _get_comparison_groups_by_genes(gene_symbol_1, gene_symbol_2) return { - 'gene_symbol_1': gene_symbol_1, - 'gene_symbol_2': gene_symbol_2, - 'citation': res[0].get('citation'), - 'annotation': res[0].get('annotation'), - 'evidence': res[0].get('evidence'), - 'relation': res[0].get('relation'), - 'comparisons': comparisons, - 'bel_object': res[0].get('in_bel'), - 'bel_subject': res[0].get('out_bel'), + "gene_symbol_1": gene_symbol_1, + "gene_symbol_2": gene_symbol_2, + "citation": res[0].get("citation"), + "annotation": res[0].get("annotation"), + "evidence": res[0].get("evidence"), + "relation": res[0].get("relation"), + "comparisons": comparisons, + "bel_object": res[0].get("in_bel"), + "bel_subject": res[0].get("out_bel"), } return [] @@ -185,13 +185,13 @@ def get_expression_atlas(): GroupComparison.name, Experiment.title, GroupComparison.group_comparison, - GroupComparison.id + GroupComparison.id, ] data = json.loads(request.data) b = Bel() query = b.session.query(Experiment).join(GroupComparison) - for table, columns_params in [(k, v) for k, v in data.items() if k not in ('page', 'page_size')]: - any_value = any([v['value'].strip() for k, v in columns_params.items()]) + for table, columns_params in [(k, v) for k, v in data.items() if k not in ("page", "page_size")]: + any_value = any([v["value"].strip() for k, v in columns_params.items()]) if any_value: model = models_dict[table] @@ -200,18 +200,18 @@ def get_expression_atlas(): query = add_query_filters(query, columns_params, model) 
query = query.with_entities(*columns) query = query.group_by(Experiment.id, GroupComparison.id) - limit = int(data['page_size']) if data.get('page_size') else 10 - page = int(data['page']) if data.get('page') else 1 + limit = int(data["page_size"]) if data.get("page_size") else 10 + page = int(data["page"]) if data.get("page") else 1 count = query.count() query = query.limit(limit).offset(limit * (page - 1)) print(query.statement.compile(compile_kwargs={"literal_binds": True})) column_names = [f"{x.parent.class_.__tablename__}.{x.name}" for x in [inspect(c) for c in columns]] return { - 'page': page, - 'pages': math.ceil(count / limit), - 'count': count, - 'page_size': limit, - 'column_names': column_names, - 'results': [tuple(x) for x in query.all()] + "page": page, + "pages": math.ceil(count / limit), + "count": count, + "page_size": limit, + "column_names": column_names, + "results": [tuple(x) for x in query.all()], } diff --git a/ebel/web/api/ebel/v1/gwas_catalog.py b/ebel/web/api/ebel/v1/gwas_catalog.py index 130fc70..0c56e54 100644 --- a/ebel/web/api/ebel/v1/gwas_catalog.py +++ b/ebel/web/api/ebel/v1/gwas_catalog.py @@ -12,31 +12,33 @@ def get_gwas_catalog(): def get_ebel_relation(): """Get GWAS Catalog related eBEL relations.""" - allowed_relations = ['has_snp_gwascatalog', - 'has_mapped_snp_gc', - 'has_downstream_snp_gc', - 'has_upstream_snp_gc'] - relation = request.args.get('relation') + allowed_relations = [ + "has_snp_gwascatalog", + "has_mapped_snp_gc", + "has_downstream_snp_gc", + "has_upstream_snp_gc", + ] + relation = request.args.get("relation") relation = relation if relation in allowed_relations else allowed_relations[0] conf = { - 'snp_rid': "in.@rid.asString()", - 'namespace': "out.namespace", - 'name': "out.name", - 'gene_rid': "out.@rid.asString()", - 'pubmed_id': "pubmed_id", - 'disease_trait': "disease_trait", - 'rs_number': "in.rs_number", - 'edge_rid': "@rid.asString()" + "snp_rid": "in.@rid.asString()", + "namespace": "out.namespace", + 
"name": "out.name", + "gene_rid": "out.@rid.asString()", + "pubmed_id": "pubmed_id", + "disease_trait": "disease_trait", + "rs_number": "in.rs_number", + "edge_rid": "@rid.asString()", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += f" FROM {relation}" ra = request.args paras = {k: ra[k] for k in ra if k in conf} if paras: - sql += " WHERE " + ' AND '.join([f'{conf[k]} like "{v}"' for k, v in paras.items()]) + sql += " WHERE " + " AND ".join([f'{conf[k]} like "{v}"' for k, v in paras.items()]) return _get_paginated_ebel_query_result(sql, print_sql=True) diff --git a/ebel/web/api/ebel/v1/hgnc.py b/ebel/web/api/ebel/v1/hgnc.py index b2fcd03..80ff615 100644 --- a/ebel/web/api/ebel/v1/hgnc.py +++ b/ebel/web/api/ebel/v1/hgnc.py @@ -1,14 +1,14 @@ """HGNC API methods.""" from flask import request -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import hgnc +from ebel.web.api import RDBMS from ebel.web.api.ebel.v1 import _get_data def get_by_symbol(): """Get paginated list of HGNC entries by given gene symbol.""" - symbol = request.args.get('symbol') + symbol = request.args.get("symbol") if symbol: db_entry = RDBMS.get_session().query(hgnc.Hgnc).filter_by(symbol=symbol).first() if db_entry: @@ -17,7 +17,7 @@ def get_by_symbol(): def get_uniprot_accession_by_hgnc_symbol(): """Return UniProt accession number by HGCN gene symbol.""" - symbol = request.args.get('symbol') + symbol = request.args.get("symbol") if symbol: db_entry = RDBMS.get_session().query(hgnc.Hgnc).filter_by(symbol=symbol).first() if db_entry: diff --git a/ebel/web/api/ebel/v1/intact.py b/ebel/web/api/ebel/v1/intact.py index b264d97..f625920 100644 --- a/ebel/web/api/ebel/v1/intact.py +++ b/ebel/web/api/ebel/v1/intact.py @@ -1,11 +1,12 @@ """IntAct API methods.""" -from sqlalchemy import or_ from flask.globals import request +from sqlalchemy import or_ -from ebel.web.api import RDBMS -from 
ebel.manager.rdbms.models.intact import Intact from ebel.manager.orientdb.odb_structure import intact_edges -from ebel.web.api.ebel.v1 import _get_data, _get_paginated_query_result, _get_paginated_ebel_query_result +from ebel.manager.rdbms.models.intact import Intact +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, + _get_paginated_query_result) def get_intact(): @@ -15,7 +16,7 @@ def get_intact(): def get_by_uniprot(): """Get IntAct entry by UniProt ID.""" - ua = request.args.get('uniprot_accession') + ua = request.args.get("uniprot_accession") if ua: a = Intact.int_a_uniprot_id b = Intact.int_a_uniprot_id @@ -25,27 +26,29 @@ def get_by_uniprot(): def get_ebel_relation(): """Get IntAct related eBEL relations.""" - has_ppi_ia_edge = [x for x in intact_edges if x.name == 'has_ppi_ia'][0] + has_ppi_ia_edge = [x for x in intact_edges if x.name == "has_ppi_ia"][0] conf = {x.prop_name: x.prop_name for x in has_ppi_ia_edge.props} - conf.update({ - 'relation_type': "@class", - 'edge_id': "@rid.asString()", - 'interactor_a_rid': "out.@rid.asString()", - 'interactor_a_name': "out.name", - 'interactor_a_namespace': "out.namespace", - 'interactor_a_bel': "out.bel", - 'interactor_b_rid': "in.@rid.asString()", - 'interactor_b_namespace': "in.namespace", - 'interactor_b_name': "in.name", - 'interactor_b_bel': "in.bel", - }) + conf.update( + { + "relation_type": "@class", + "edge_id": "@rid.asString()", + "interactor_a_rid": "out.@rid.asString()", + "interactor_a_name": "out.name", + "interactor_a_namespace": "out.namespace", + "interactor_a_bel": "out.bel", + "interactor_b_rid": "in.@rid.asString()", + "interactor_b_namespace": "in.namespace", + "interactor_b_name": "in.name", + "interactor_b_bel": "in.bel", + } + ) sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_ppi_ia" ra = request.args paras = {k: ra[k] for 
k in ra if k in conf} if paras: - sql += " WHERE " + ' AND '.join([f'{conf[k].replace(".asString()","")} like "{v}"' for k, v in paras.items()]) + sql += " WHERE " + " AND ".join([f'{conf[k].replace(".asString()","")} like "{v}"' for k, v in paras.items()]) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/iuphar.py b/ebel/web/api/ebel/v1/iuphar.py index cb47748..3600680 100644 --- a/ebel/web/api/ebel/v1/iuphar.py +++ b/ebel/web/api/ebel/v1/iuphar.py @@ -1,9 +1,9 @@ """IUPHAR API methods.""" from flask.globals import request -from ebel.manager.orientdb.odb_structure import iuphar_edges -from ebel.web.api import RDBMS +from ebel.manager.orientdb.odb_structure import iuphar_edges from ebel.manager.rdbms.models import iuphar +from ebel.web.api import RDBMS from ebel.web.api.ebel.v1 import _get_data, _get_paginated_ebel_query_result @@ -14,7 +14,7 @@ def get_interaction(): def get_ligandby_by_id(): """Get IUPHAR ligand entry by ID.""" - ligand_id = request.args.get('id') + ligand_id = request.args.get("id") return RDBMS.get_session().query(iuphar.IupharLigand).get(ligand_id).as_dict() @@ -30,25 +30,27 @@ def get_interaction_by_target_gene_symbol(): def get_ebel_relation(): """Get Iuphar related eBEL relations.""" - as_iuphar_edge = [x for x in iuphar_edges if x.name == 'iuphar_interaction'][0] + as_iuphar_edge = [x for x in iuphar_edges if x.name == "iuphar_interaction"][0] conf = {x.prop_name: x.prop_name for x in as_iuphar_edge.props} - conf.update({ - 'edge_id': "@rid.asString()", - 'interactor_a_rid': "out.@rid.asString()", - 'interactor_a_name': "out.name", - 'interactor_a_namespace': "out.namespace", - 'interactor_b_rid': "in.@rid.asString()", - 'interactor_b_namespace': "in.namespace", - 'interactor_b_name': "in.name", - 'relation': "@class.asString()" - }) + conf.update( + { + "edge_id": "@rid.asString()", + "interactor_a_rid": "out.@rid.asString()", + "interactor_a_name": "out.name", + "interactor_a_namespace": "out.namespace", + 
"interactor_b_rid": "in.@rid.asString()", + "interactor_b_namespace": "in.namespace", + "interactor_b_name": "in.name", + "relation": "@class.asString()", + } + ) sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM iuphar_interaction" ra = request.args paras = {k: ra[k] for k in ra if k in conf} - pmid = request.args.get('pmid') + pmid = request.args.get("pmid") if paras or pmid: wheres = [] if paras: @@ -56,6 +58,6 @@ def get_ebel_relation(): if pmid: wheres += [f"{pmid} in pmids"] - sql += " WHERE " + ' AND '.join(wheres) + sql += " WHERE " + " AND ".join(wheres) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/kegg.py b/ebel/web/api/ebel/v1/kegg.py index dbd6a79..e0bd768 100644 --- a/ebel/web/api/ebel/v1/kegg.py +++ b/ebel/web/api/ebel/v1/kegg.py @@ -1,11 +1,12 @@ """KEGG API methods.""" -from sqlalchemy import or_ from flask.globals import request +from sqlalchemy import or_ -from ebel.web.api import RDBMS from ebel.manager.rdbms.models.kegg import Kegg -from ebel.web.api.ebel.v1 import _get_data, _get_paginated_query_result, _get_paginated_ebel_query_result +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, + _get_paginated_query_result) def get_kegg(): @@ -15,7 +16,7 @@ def get_kegg(): def get_by_gene_symbol(): """Get KEGG entry by gene symbol.""" - symbol = request.args.get('gene_symbol') + symbol = request.args.get("gene_symbol") query_filter = or_(Kegg.gene_symbol_a == symbol, Kegg.gene_symbol_b == symbol) query = RDBMS.get_session().query(Kegg).filter(query_filter) return _get_paginated_query_result(query) @@ -24,26 +25,26 @@ def get_by_gene_symbol(): def get_ebel_relation(): """Get KEGG related eBEL relations.""" conf = { - 'interaction_type': 'interaction_type', - 'pathway_names': "pathway_name", - 'edge_id': "@rid.asString()", - 'interactor_a_rid': 
"out.@rid.asString()", - 'interactor_a_name': "out.name", - 'interactor_a_namespace': "out.namespace", - 'interactor_a_bel': "out.bel", - 'interactor_b_rid': "in.@rid.asString()", - 'interactor_b_namespace': "in.namespace", - 'interactor_b_name': "in.name", - 'interactor_b_bel': "in.bel", - 'relation': "@class.asString()", + "interaction_type": "interaction_type", + "pathway_names": "pathway_name", + "edge_id": "@rid.asString()", + "interactor_a_rid": "out.@rid.asString()", + "interactor_a_name": "out.name", + "interactor_a_namespace": "out.namespace", + "interactor_a_bel": "out.bel", + "interactor_b_rid": "in.@rid.asString()", + "interactor_b_namespace": "in.namespace", + "interactor_b_name": "in.name", + "interactor_b_bel": "in.bel", + "relation": "@class.asString()", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_ppi_kg" ra = request.args paras = {k: ra[k] for k in ra if k in conf} - pathway_name = request.args.get('pathway_name') + pathway_name = request.args.get("pathway_name") if paras or pathway_name: wheres = [] if paras: @@ -51,6 +52,6 @@ def get_ebel_relation(): if pathway_name: wheres += [f'"{pathway_name}" in pathway_name'] - sql += " WHERE " + ' AND '.join(wheres) + sql += " WHERE " + " AND ".join(wheres) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/mirtarbase.py b/ebel/web/api/ebel/v1/mirtarbase.py index 76c9650..9068d7b 100644 --- a/ebel/web/api/ebel/v1/mirtarbase.py +++ b/ebel/web/api/ebel/v1/mirtarbase.py @@ -14,22 +14,22 @@ def get_mirtarbase(): def get_ebel_relation(): """Get miRTarBase related eBEL relations.""" conf = { - 'mirbase_id': "out.name", - 'target_rna_symbol': "in.name", - 'target_namespace': "in.namespace", - 'support_type': "support_type", - 'pmid': "pmid", - 'experiments': "experiments" + "mirbase_id": "out.name", + "target_rna_symbol": "in.name", + "target_namespace": "in.namespace", + 
"support_type": "support_type", + "pmid": "pmid", + "experiments": "experiments", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_mirgene_target" ra = request.args wheres = [] - experiment = ra.get('experiment') + experiment = ra.get("experiment") if experiment: wheres.append(f'"{experiment}" in experiments') @@ -38,6 +38,6 @@ def get_ebel_relation(): wheres += [f'{conf[k].replace(".asString()","")} like "{v}"' for k, v in paras.items()] if wheres: - sql += " WHERE " + ' AND '.join(wheres) + sql += " WHERE " + " AND ".join(wheres) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/ncbi.py b/ebel/web/api/ebel/v1/ncbi.py index 4db4c84..1d96a2c 100644 --- a/ebel/web/api/ebel/v1/ncbi.py +++ b/ebel/web/api/ebel/v1/ncbi.py @@ -2,8 +2,8 @@ from flask import request -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import ncbi +from ebel.web.api import RDBMS from ebel.web.api.ebel.v1 import _get_data, _get_paginated_query_result @@ -36,11 +36,11 @@ def get_medgen_by_pmid(): """Get MedGen terms by PMID.""" n = ncbi.NcbiMedGenName p = ncbi.NcbiMedGenPmid - q = RDBMS.get_session().query(n.cui, n.name).join(p).filter_by(pmid=request.args.get('pmid')) + q = RDBMS.get_session().query(n.cui, n.name).join(p).filter_by(pmid=request.args.get("pmid")) return _get_paginated_query_result(q, return_dict=True) def get_go_by_pmid(): """Get gene ontology by PMID.""" - q = RDBMS.get_session().query(ncbi.NcbiGeneGo).join(ncbi.NcbiGeneGoPmid).filter_by(pmid=request.args.get('pmid')) + q = RDBMS.get_session().query(ncbi.NcbiGeneGo).join(ncbi.NcbiGeneGoPmid).filter_by(pmid=request.args.get("pmid")) return _get_paginated_query_result(q, print_sql=True) diff --git a/ebel/web/api/ebel/v1/nsides.py b/ebel/web/api/ebel/v1/nsides.py index ab6394e..72c1714 100644 --- a/ebel/web/api/ebel/v1/nsides.py +++ b/ebel/web/api/ebel/v1/nsides.py @@ -14,20 +14,20 
@@ def get_nsides(): def get_ebel_relation(): """Get NSIDES related eBEL relations.""" conf = { - 'drugbank_id': "out.drugbank_id", - 'drug_label': "out.label", - 'ppr': "prr", - 'mean_reporting_frequency': "mean_reporting_frequency", - 'condition_meddra_id': "in.condition_meddra_id", - 'side_effect': "in.label", + "drugbank_id": "out.drugbank_id", + "drug_label": "out.label", + "ppr": "prr", + "mean_reporting_frequency": "mean_reporting_frequency", + "condition_meddra_id": "in.condition_meddra_id", + "side_effect": "in.label", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_side_effect" paras = {k: request.args[k] for k in request.args if k in conf} if paras: wheres = [f'{conf[k].replace(".asString()","")} like "{v}"' for k, v in paras.items()] - sql += " WHERE " + ' AND '.join(wheres) + sql += " WHERE " + " AND ".join(wheres) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/pathway_commons.py b/ebel/web/api/ebel/v1/pathway_commons.py index e1d391d..3f5d1f4 100644 --- a/ebel/web/api/ebel/v1/pathway_commons.py +++ b/ebel/web/api/ebel/v1/pathway_commons.py @@ -3,10 +3,12 @@ from flask import request from sqlalchemy import or_ +from ebel.manager.rdbms.models.pathway_commons import ( + PathwayCommons, PathwayName, Pmid, pathway_commons__pathway_name) from ebel.web.api import RDBMS -from ebel.manager.rdbms.models.pathway_commons import PathwayCommons, PathwayName, pathway_commons__pathway_name, Pmid -from ebel.web.api.ebel.v1 import _get_data, _get_paginated_ebel_query_result, _get_paginated_query_result, \ - _get_terms_from_model_starts_with +from ebel.web.api.ebel.v1 import (_get_data, _get_paginated_ebel_query_result, + _get_paginated_query_result, + _get_terms_from_model_starts_with) def get_pathway_commons(): @@ -16,67 +18,76 @@ def get_pathway_commons(): def get_by_gene_symbol(): """Get entries by gene symbol.""" - 
gene_symbol = request.args.get('gene_symbol') - query = RDBMS.get_session().query(PathwayCommons).filter( - or_(PathwayCommons.participant_a == gene_symbol, PathwayCommons.participant_b == gene_symbol)) + gene_symbol = request.args.get("gene_symbol") + query = ( + RDBMS.get_session() + .query(PathwayCommons) + .filter( + or_( + PathwayCommons.participant_a == gene_symbol, + PathwayCommons.participant_b == gene_symbol, + ) + ) + ) return _get_paginated_query_result(query) def get_by_pathway_name(): """Get entries by pathway name.""" - pathway_name = request.args.get('pathway_name') - query = RDBMS.get_session().query(PathwayCommons) \ - .join(pathway_commons__pathway_name) \ - .join(PathwayName) \ + pathway_name = request.args.get("pathway_name") + query = ( + RDBMS.get_session() + .query(PathwayCommons) + .join(pathway_commons__pathway_name) + .join(PathwayName) .filter_by(name=pathway_name) + ) return _get_paginated_query_result(query) def get_by_pmid(): """Get entries by PMID.""" - pmid = request.args.get('pmid') - query = RDBMS.get_session().query(PathwayCommons) \ - .join(Pmid) \ - .filter_by(pmid=pmid) + pmid = request.args.get("pmid") + query = RDBMS.get_session().query(PathwayCommons).join(Pmid).filter_by(pmid=pmid) return _get_paginated_query_result(query) def get_pathway_name_starts_with(): """Get entries where pathway starts with given value.""" - return _get_terms_from_model_starts_with('pathway_name', PathwayName.name) + return _get_terms_from_model_starts_with("pathway_name", PathwayName.name) def get_ebel_relation(): """Get Pathway Commons related eBEL relations.""" conf = { - 'interactor_a_rid': "out.rid.asString()", - 'interactor_a_namespace': "out.namespace", - 'interactor_a_name': "out.name", - 'edge_rid': "rid.asString()", - 'relation_type': "type", - 'sources': "sources", - 'pmids': "pmids", - 'pathways': "pathways.name", - 'interactor_b_rid': "in.rid.asString()", - 'interactor_b_namespace': "in.namespace", - 'interactor_b_name': "in.name", + 
"interactor_a_rid": "out.rid.asString()", + "interactor_a_namespace": "out.namespace", + "interactor_a_name": "out.name", + "edge_rid": "rid.asString()", + "relation_type": "type", + "sources": "sources", + "pmids": "pmids", + "pathways": "pathways.name", + "interactor_b_rid": "in.rid.asString()", + "interactor_b_namespace": "in.namespace", + "interactor_b_name": "in.name", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_action_pc" ra = request.args wheres = [] - source = ra.get('source') + source = ra.get("source") if source: wheres.append(f'"{source}" in sources') - pmid = ra.get('pmid') + pmid = ra.get("pmid") if source: - wheres.append(f'{pmid} in pmids') + wheres.append(f"{pmid} in pmids") - pathway = ra.get('pathway') + pathway = ra.get("pathway") if pathway: wheres.append(f'"{pathway}" in pathways.name') @@ -85,6 +96,6 @@ def get_ebel_relation(): wheres += [f'{conf[k].replace(".asString()","")} like "{v}"' for k, v in paras.items()] if wheres: - sql += " WHERE " + ' AND '.join(wheres) + sql += " WHERE " + " AND ".join(wheres) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/protein_atlas.py b/ebel/web/api/ebel/v1/protein_atlas.py index cf8ec7a..c4398f4 100644 --- a/ebel/web/api/ebel/v1/protein_atlas.py +++ b/ebel/web/api/ebel/v1/protein_atlas.py @@ -1,6 +1,6 @@ """Protein Atlas API methods.""" -from ebel.web.api.ebel.v1 import _get_data from ebel.manager.rdbms.models import protein_atlas +from ebel.web.api.ebel.v1 import _get_data def get_rna_brain_fantom(): diff --git a/ebel/web/api/ebel/v1/reactome.py b/ebel/web/api/ebel/v1/reactome.py index cdf4c68..3621eca 100644 --- a/ebel/web/api/ebel/v1/reactome.py +++ b/ebel/web/api/ebel/v1/reactome.py @@ -13,8 +13,8 @@ def get_reactome(): def get_bel_node_by_pathway_name(): """Get Reactome related eBEL nodes by pathway name.""" - pathway_name = request.args.get('pathway_name') - 
sql = f'''SELECT + pathway_name = request.args.get("pathway_name") + sql = f"""SELECT @rid.asString() as rid, namespace, name, @@ -25,5 +25,5 @@ def get_bel_node_by_pathway_name(): WHERE pure=true AND "{pathway_name}" in reactome_pathways - ''' + """ return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/stringdb.py b/ebel/web/api/ebel/v1/stringdb.py index 3b91b26..c48ef38 100644 --- a/ebel/web/api/ebel/v1/stringdb.py +++ b/ebel/web/api/ebel/v1/stringdb.py @@ -19,19 +19,19 @@ def get_stringdb_action(): def get_ebel_relation(): """Get StringDB related eBEL relations.""" conf = { - 'interactor_a_rid': "out.@rid.asString()", - 'interactor_a_name': "out.name", - 'interactor_a_namespace': "out.namespace", - 'interactor_a_uniprot': "out.uniprot", - 'edge_id': "@rid.asString()", - 'relation': "@class.asString()", - 'interactor_b_rid': "in.@rid.asString()", - 'interactor_b_namespace': "in.namespace", - 'interactor_b_name': "in.name", - 'interactor_b_uniprot': "in.uniprot", + "interactor_a_rid": "out.@rid.asString()", + "interactor_a_name": "out.name", + "interactor_a_namespace": "out.namespace", + "interactor_a_uniprot": "out.uniprot", + "edge_id": "@rid.asString()", + "relation": "@class.asString()", + "interactor_b_rid": "in.@rid.asString()", + "interactor_b_namespace": "in.namespace", + "interactor_b_name": "in.name", + "interactor_b_uniprot": "in.uniprot", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM has_action_st" ra = request.args @@ -39,6 +39,6 @@ def get_ebel_relation(): if paras: wheres = [f'{conf[k].replace(".asString()","")} like "{v}"' for k, v in paras.items()] - sql += " WHERE " + ' AND '.join(wheres) + sql += " WHERE " + " AND ".join(wheres) return _get_paginated_ebel_query_result(sql) diff --git a/ebel/web/api/ebel/v1/uniprot.py b/ebel/web/api/ebel/v1/uniprot.py index 785d83b..0f231e5 100644 --- 
a/ebel/web/api/ebel/v1/uniprot.py +++ b/ebel/web/api/ebel/v1/uniprot.py @@ -1,18 +1,22 @@ """UniProt API methods.""" -from flask import request import json -from . import add_query_filters + +from flask import request + from ebel import Bel -from ebel.web.api import RDBMS from ebel.manager.rdbms.models import uniprot -from ebel.web.api.ebel.v1 import _get_paginated_query_result, _get_terms_from_model_starts_with +from ebel.web.api import RDBMS +from ebel.web.api.ebel.v1 import (_get_paginated_query_result, + _get_terms_from_model_starts_with) + +from . import add_query_filters model_by_tablename = {x.__tablename__: x for x in uniprot.Base.__subclasses__()} nm_tables = { - 'uniprot_keyword': uniprot.uniprot__uniprot_keyword, - 'uniprot_host': uniprot.uniprot__uniprot_host, - 'uniprot_xref': uniprot.uniprot__uniprot_xref, - 'uniprot_subcellular_location': uniprot.uniprot__uniprot_subcellular_location + "uniprot_keyword": uniprot.uniprot__uniprot_keyword, + "uniprot_host": uniprot.uniprot__uniprot_host, + "uniprot_xref": uniprot.uniprot__uniprot_xref, + "uniprot_subcellular_location": uniprot.uniprot__uniprot_subcellular_location, } @@ -26,24 +30,22 @@ def get_uniprot_advanced(): uniprot.Function.description, up.recommended_name, uniprot.GeneSymbol.symbol, - uniprot.Organism.scientific_name + uniprot.Organism.scientific_name, ) - query = RDBMS.get_session().query(up).outerjoin( - uniprot.Function - ).outerjoin( - uniprot.GeneSymbol - ).outerjoin( - uniprot.Organism + query = ( + RDBMS.get_session() + .query(up) + .outerjoin(uniprot.Function) + .outerjoin(uniprot.GeneSymbol) + .outerjoin(uniprot.Organism) ) already_joined_models = (up, uniprot.Function, uniprot.GeneSymbol, uniprot.Organism) for table, columns in data.items(): - - if table not in ('page', 'page_size', 'number_of_results'): + if table not in ("page", "page_size", "number_of_results"): model = model_by_tablename.get(table) - values_exists = any([x.get('value') for x in columns.values()]) + values_exists = 
any([x.get("value") for x in columns.values()]) if model and model not in already_joined_models and values_exists: - if table in nm_tables: query = query.outerjoin(nm_tables[table]) @@ -60,97 +62,99 @@ def get_uniprot(): """Get paginated list of UniProt entries.""" q = RDBMS.get_session().query(uniprot.Uniprot) - accession = request.args.get('accession') + accession = request.args.get("accession") q = q.filter_by(accession=accession) if accession else q - taxid = request.args.get('taxonomy_id') + taxid = request.args.get("taxonomy_id") q = q.filter_by(taxid=taxid) if taxid else q - gene_symbol = request.args.get('gene_symbol') + gene_symbol = request.args.get("gene_symbol") if gene_symbol: q = q.join(uniprot.GeneSymbol).filter_by(symbol=gene_symbol) - keyword = request.args.get('keyword') + keyword = request.args.get("keyword") if keyword: - q = q.join(uniprot.uniprot__uniprot_keyword) \ - .join(uniprot.Keyword).filter_by(keyword_name=keyword) + q = q.join(uniprot.uniprot__uniprot_keyword).join(uniprot.Keyword).filter_by(keyword_name=keyword) - xref_db = request.args.get('xref_db') - xref_id = request.args.get('xref_id') + xref_db = request.args.get("xref_db") + xref_id = request.args.get("xref_id") if xref_db or xref_id: q = q.join(uniprot.uniprot__uniprot_xref).join(uniprot.Xref) q = q.filter_by(db=xref_db) if xref_db else q q = q.filter_by(identifier=xref_id) if xref_id else q - subcellular_location = request.args.get('subcellular_location') + subcellular_location = request.args.get("subcellular_location") if subcellular_location: - q = q.join(uniprot.uniprot__uniprot_subcellular_location) \ - .join(uniprot.SubcellularLocation).filter_by(name=subcellular_location) + q = ( + q.join(uniprot.uniprot__uniprot_subcellular_location) + .join(uniprot.SubcellularLocation) + .filter_by(name=subcellular_location) + ) return _get_paginated_query_result(q) def get_keyword_starts_with(): """Get entries where keyword starts with given value.""" - return 
_get_terms_from_model_starts_with('keyword', uniprot.Keyword.keyword_name) + return _get_terms_from_model_starts_with("keyword", uniprot.Keyword.keyword_name) def get_subcellular_location_starts_with(): """Get entries where subcellular location starts with given value.""" - return _get_terms_from_model_starts_with('subcellular_location', uniprot.SubcellularLocation.name) + return _get_terms_from_model_starts_with("subcellular_location", uniprot.SubcellularLocation.name) def get_gene_symbol_starts_with(): """Get entries where symbol starts with given value.""" - return _get_terms_from_model_starts_with('gene_symbol', uniprot.GeneSymbol.symbol) + return _get_terms_from_model_starts_with("gene_symbol", uniprot.GeneSymbol.symbol) def get_gene_starts_with(): """Get entries where gene starts with given value.""" - return _get_terms_from_model_starts_with('gene', uniprot.Gene.name) + return _get_terms_from_model_starts_with("gene", uniprot.Gene.name) def get_organism_starts_with(): """Get entries where organism starts with given value.""" - return _get_terms_from_model_starts_with('organism', uniprot.Organism.scientific_name) + return _get_terms_from_model_starts_with("organism", uniprot.Organism.scientific_name) def get_function_starts_with(): """Get entries where description starts with given value.""" - return _get_terms_from_model_starts_with('description', uniprot.Function.description) + return _get_terms_from_model_starts_with("description", uniprot.Function.description) def get_bel_node_uniprot(): """Get UniProt related eBEL nodes.""" b = Bel() conf = { - 'rid': "@rid.asString()", - 'name': "name", - 'namespace': "namespace", - 'bel': "bel", - 'uniprot_accession': "uniprot" + "rid": "@rid.asString()", + "name": "name", + "namespace": "namespace", + "bel": "bel", + "uniprot_accession": "uniprot", } sql = "SELECT " - sql += ', '.join([f"{v} as {k}" for k, v in conf.items()]) + sql += ", ".join([f"{v} as {k}" for k, v in conf.items()]) sql += " FROM protein WHERE pure 
= true" - uniprot = request.args.get('accession') + uniprot = request.args.get("accession") if uniprot: sql += f' AND uniprot = "{uniprot}"' else: - return {'error': "uniprot_accession is required."} + return {"error": "uniprot_accession is required."} entries = b.query_get_dict(sql) if entries: result_dict = entries[0] - result_dict['edges'] = {} - for direction in ('in', 'out'): + result_dict["edges"] = {} + for direction in ("in", "out"): match = "match {class:protein, where:(uniprot='" match += uniprot + "' and pure=true)}." + direction + "E(bel_relation){as:e}" match += " return e.@rid, e.@class as relation" sql_edges = f"""Select relation, count(*) from ({match}) group by relation order by count desc""" - result_dict['edges'][direction] = b.query_get_dict(sql_edges) + result_dict["edges"][direction] = b.query_get_dict(sql_edges) return result_dict diff --git a/ebel/web/app.py b/ebel/web/app.py index e4e51ff..145dd1d 100644 --- a/ebel/web/app.py +++ b/ebel/web/app.py @@ -1,16 +1,21 @@ """API methods.""" -import connexion import webbrowser +import connexion from flask_cors import CORS application = connexion.FlaskApp(__name__) -application.add_api('openapi.yml') +application.add_api("openapi.yml") CORS(application.app, expose_headers=["Content-Disposition"]) -def run(host: str = '0.0.0.0', port: int = 5000, debug_mode: bool = True, open_browser: bool = False): +def run( + host: str = "0.0.0.0", + port: int = 5000, + debug_mode: bool = True, + open_browser: bool = False, +): """Run the API server. Parameters @@ -24,12 +29,12 @@ def run(host: str = '0.0.0.0', port: int = 5000, debug_mode: bool = True, open_b open_browser: bool If True, automatically opens browser to API UI. 
""" - url = f'http://{host}:{port}/ui' + url = f"http://{host}:{port}/ui" if open_browser: webbrowser.open(url) - print(f'Starting web server {url}') + print(f"Starting web server {url}") application.run(host=host, port=port, debug=debug_mode) -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/ebel/web/openapi.yml b/ebel/web/openapi.yml index 56b3e20..eb41e01 100644 --- a/ebel/web/openapi.yml +++ b/ebel/web/openapi.yml @@ -3405,10 +3405,6 @@ paths: in: query schema: type: string - - name: intermediate_filament_db - in: query - schema: - type: string - name: iuphar in: query schema: @@ -3437,10 +3433,6 @@ paths: in: query schema: type: string - - name: mamit_trnadb - in: query - schema: - type: string - name: merops in: query schema: @@ -3457,10 +3449,6 @@ paths: in: query schema: type: string - - name: pseudogene_org - in: query - schema: - type: string - name: snornabase in: query schema: diff --git a/mkdocs.yml b/mkdocs.yml index 46bc08d..8be399d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,16 +4,15 @@ site_description: Creation and enrichment of BEL networks. 
site_author: Bruce Schultz repo_url: https://github.com/e-bel/ebel -#edit_uri: blob/master/docs/ theme: readthedocs extra: - version: 1.0.25 + version: 1.0.37 nav: +# - Home: index.md - Home: index.md -# - Getting Started: ../README.rst - User Guide: - API: api.md - CLI: cli.md @@ -22,26 +21,20 @@ nav: markdown_extensions: - mkdocs-click -copyright: Copyright © 2022 +copyright: Copyright © 2023 plugins: - search - render_swagger -# - redirects: -# redirect_maps: -# user-guide/plugins.md: dev-guide/plugins.md -# user-guide/custom-themes.md: dev-guide/themes.md -# user-guide/styling-your-docs.md: user-guide/choosing-your-theme.md - autorefs - mkdocstrings: + default_handler: python handlers: python: + paths: [ebel] options: docstring_section_style: list members_order: source show_root_heading: true show_source: false show_signature_annotations: true - -#watch: -# - mkdocs diff --git a/neo4j-import.py b/neo4j-import.py new file mode 100644 index 0000000..517243b --- /dev/null +++ b/neo4j-import.py @@ -0,0 +1,22 @@ + +from pathlib import Path +from ebel.manager.neo4j.n4j_meta import Neo4jClient +from ebel.manager.neo4j.bel import Neo4jBel + +sherpa = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/abstract/sherpa.json" +gpt4 = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/abstract/gpt4.json" +gpt35 = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/abstract/gpt35.json" +tau = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/abstract/tau_modified.json" + +# sherpa = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/full-text/sherpa.json" +# gpt4 = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/full-text/gpt4.json" +# gpt35 = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/full-text/gpt35.json" +# tau = "C:/Users/nbabaiha/Documents/GitHub/chatgpt-paper/neo4j-json-files/full-text/tau_modified.json" + +neo = 
Neo4jClient(uri="bolt://localhost:7687", database="neo4j", user="neo4j", password="12345678") +#print("test", neo.session.run("MATCH (n) RETURN n").data()) + +n4jbel = Neo4jBel(client=neo) +n4jbel.import_json([sherpa, tau, gpt35, gpt4], update_from_protein2gene=False) +#n4jbel.import_json([tau_KG, sherpa_json, common_triples_gpt4_json, common_triples_gpt3]) +print("Done") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..030a032 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,120 @@ +[build-system] +requires = ["poetry_core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "ebel" +version = "1.0.37" +description = "e(BE:L) - validation and extension of BEL networks." +authors = [ + "Bruce Schultz ", + "Christian Ebeling ", +] +maintainers = ["Christian Ebeling "] +license = "MIT" +repository = "https://github.com/e-bel/ebel" +readme = "README.rst" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +keywords = ["Biomedical Database", "Biological Expression Language", "Graph Database"] +packages = [ + { include = "ebel" }, + { include = "tests", format = "sdist" }, +] + +[tool.poetry.urls] +Issues = 'https://github.com/e-bel/ebel/issues' +Documentation = 'https://ebel.readthedocs.io/en/latest/' + +[tool.poetry.dependencies] +lark-parser = "^0.11.2" +click = "^7.1.2" +requests = "^2.25.1" +tqdm = "^4.59.0" +pandas = "^1.2.4" +sqlalchemy = "^1.4.46" +SQLAlchemy-Utils = "^0.37.7" +xlwt = "^1.3.0" +xlrd = "^2.0.1" 
+xlsxwriter = "^1.3.8" +xmltodict = "^0.12.0" +GitPython = "^3.1.14" +lxml = "^4.6.5" +flask = "^2.0.1" +flask_cors = "^3.0.10" +connexion = {version = "^2.14.1", extras = ["swagger-ui"]} +cryptography = "^3.4.7" +openpyxl = "^3.0.10" +graphviz = "0.20" +pyorientdb = "^1.0.0" +PyMySQL = "^1.0.2" +python = "^3.9" +mkdocstrings = {version = "^0.18", extras = ["python"]} + +[tool.poetry.group.dev.dependencies] +black = "^23.7.0" + +[tool.poetry.scripts] +ebel = "ebel.cli:main" + +[tool.poetry.extras] +docs = ["sphinx", "sphinx-rtd-theme", "sphinx-click", "sphinx-autodoc-typehints"] + +[tool.black] +line-length = 119 +target-version = ['py39', 'py310', 'py311'] + +[tool.coverage.run] +branch = true +source = ["ebel"] +omit = [ + # omit main + "ebel/__main__.py", + # omit CLI + "ebel/cli.py", + # omit tests and docs + "tests/*", + "docs/*", + ] + +[tool.coverage.report] +# Regexes for lines to exclude from consideration +exclude_also = [ + # Don't complain about missing debug-only code: + "def __repr__", + "if self\\.debug", + + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + + # Don't complain if non-runnable code isn't run: + "if 0:", + "if __name__ == .__main__.:", + + # Don't complain about abstract methods, they aren't run: + "@(abc\\.)?abstractmethod", + ] + +ignore_errors = true + +[tool.coverage.paths] +source = [ + "ebel/", + ".tox/*/lib/python*/site-packages/ebel", + ] + +[tool.coverage.html] +directory = "coverage_html_report" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1f5c1aa..f00f628 100755 --- a/requirements.txt +++ b/requirements.txt @@ -18,5 +18,4 @@ connexion[swagger-ui]==2.14.1 cryptography==3.4.7 openpyxl==3.0.7 graphviz -pyorientdb -neo4j \ No newline at end of file +pyorientdb \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 1032d0a..28305a8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ [metadata] name = ebel 
-version = 1.0.25 +version = 1.0.26 description = e(BE:L) - validation and extension of BEL networks long_description = file: README.rst long_description_content_type = text/x-rst diff --git a/tests/test_api/test_ebel/conftest.py b/tests/test_api/test_ebel/conftest.py index a94841e..bc8fe7f 100644 --- a/tests/test_api/test_ebel/conftest.py +++ b/tests/test_api/test_ebel/conftest.py @@ -1,10 +1,10 @@ """Init API unit tests.""" import json -import pytest - from typing import Union +import pytest + from ebel.web.app import application diff --git a/tests/test_api/test_ebel/constants.py b/tests/test_api/test_ebel/constants.py index 7b791c9..64e6940 100644 --- a/tests/test_api/test_ebel/constants.py +++ b/tests/test_api/test_ebel/constants.py @@ -1,6 +1,6 @@ """String constants for API unit tests.""" -NUM_RESULTS = 'number_of_results' +NUM_RESULTS = "number_of_results" PAGE = "page" PAGE_SIZE = "page_size" RESULTS = "results" diff --git a/tests/test_api/test_ebel/test_bel.py b/tests/test_api/test_ebel/test_bel.py index 67ff771..05b8e35 100644 --- a/tests/test_api/test_ebel/test_bel.py +++ b/tests/test_api/test_ebel/test_bel.py @@ -1,7 +1,7 @@ """Generic BEL API unit tests.""" -from .constants import RESULTS from .conftest import format_response_data +from .constants import RESULTS class TestBel: @@ -9,35 +9,35 @@ class TestBel: def test_get_edge(self, client): response = client.get( - 'api/v1/bel/edges?subject_node_class=protein&subject_namespace=HGNC&subject_name=IL4&relation=increases&citation_full_journal_name=Molecular%20neurodegeneration&citation_pub_date=2018-06-12&citation_pub_year=2018&citation_last_author=Landreth%20GE&citation_type=PubMed&title=TREM2%20in%20Neurodegenerative%20Diseases.&doi=10.1186%2Fs13024-017-0197-5&object_node_class=protein&object_namespace=HGNC&object_name=TREM2&page_size=10&page=1', - content_type='application/json' + 
"api/v1/bel/edges?subject_node_class=protein&subject_namespace=HGNC&subject_name=IL4&relation=increases&citation_full_journal_name=Molecular%20neurodegeneration&citation_pub_date=2018-06-12&citation_pub_year=2018&citation_last_author=Landreth%20GE&citation_type=PubMed&title=TREM2%20in%20Neurodegenerative%20Diseases.&doi=10.1186%2Fs13024-017-0197-5&object_node_class=protein&object_namespace=HGNC&object_name=TREM2&page_size=10&page=1", + content_type="application/json", ) cols = [ - 'subject_rid', - 'subject_node_class', - 'subject_namespace', - 'subject_name', - 'subject_bel', - 'subject_gene_symbol_involved_in', - 'subject_other_involved_in', - 'edge_rid', - 'relation', - 'evidence', - 'citation_full_journal_name', - 'citation_pub_date', - 'citation_pub_year', - 'citation_last_author', - 'citation_type', - 'author_in_author_list', - 'title', - 'doi', - 'object_rid', - 'object_node_class', - 'object_namespace', - 'object_name', - 'object_bel', - 'object_gene_symbol_involved_in', - 'object_other_involved_in', + "subject_rid", + "subject_node_class", + "subject_namespace", + "subject_name", + "subject_bel", + "subject_gene_symbol_involved_in", + "subject_other_involved_in", + "edge_rid", + "relation", + "evidence", + "citation_full_journal_name", + "citation_pub_date", + "citation_pub_year", + "citation_last_author", + "citation_type", + "author_in_author_list", + "title", + "doi", + "object_rid", + "object_node_class", + "object_namespace", + "object_name", + "object_bel", + "object_gene_symbol_involved_in", + "object_other_involved_in", ] output = format_response_data(response) @@ -48,8 +48,8 @@ def test_get_edge(self, client): hit = results[0] for col in cols: assert col in hit - assert hit['object_name'] == "IL2" - assert hit['object_namespace'] == "HGNC" - assert hit['object_node_class'] == "protein" - assert hit['citation_last_author'] == "Schellenberg GD" + assert hit["object_name"] == "IL2" + assert hit["object_namespace"] == "HGNC" + assert 
hit["object_node_class"] == "protein" + assert hit["citation_last_author"] == "Schellenberg GD" assert hit["relation"] == "increases" diff --git a/tests/test_api/test_ebel/test_biogrid.py b/tests/test_api/test_ebel/test_biogrid.py index c6a4407..8f87d9c 100644 --- a/tests/test_api/test_ebel/test_biogrid.py +++ b/tests/test_api/test_ebel/test_biogrid.py @@ -1,7 +1,7 @@ """BioGRID API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestBiogrid: @@ -13,44 +13,31 @@ def test_get_has_ppi_bg_by_symbol_taxid(self, client): pass def test_get_sources(self, client): - response = client.get( - 'api/v1/biogrid/sources', - content_type='application/json' - ) + response = client.get("api/v1/biogrid/sources", content_type="application/json") output = format_response_data(response) - expected_results = { - "BIOGRID": 1, - "FLYBASE": 4, - "POMBASE": 2, - "WORMBASE": 3 - } + expected_results = {"BIOGRID": 1, "FLYBASE": 4, "POMBASE": 2, "WORMBASE": 3} assert isinstance(output, dict) assert output == expected_results def test_get_has_ppi_bg_by_uniprot(self, client): response = client.get( - 'api/v1/ebel/biogrid/modification/by_uniprot?uniprot=P10636', - content_type='application/json' + "api/v1/ebel/biogrid/modification/by_uniprot?uniprot=P10636", content_type="application/json" ) output = format_response_data(response) example_result = { # Used to quickly getting cols - "biogrid_ids": [ - 2623252 - ], + "biogrid_ids": [2623252], "modification": "Proteolytic Processing", "object_label": "Caspase-7", "object_name": "CASP7", "object_namespace": "HGNC", "object_taxonomy_id": 9606, "object_uniprot": "P55210", - "pmids": [ - 12888622 - ], + "pmids": [12888622], "subject_label": "Microtubule-associated protein tau", "subject_name": "MAPT", "subject_namespace": "HGNC", "subject_taxonomy_id": 9606, - "subject_uniprot": "P10636" + "subject_uniprot": "P10636", } assert 
len(output) > 4 assert isinstance(output, list) @@ -67,326 +54,287 @@ def test_get_has_ppi_bg_by_uniprot(self, client): assert hit[key] == val def test_get_biogrid_by_biogrid_id(self, client): - response = client.get( - 'api/v1/biogrid/by_biogrid_id/2658248', - content_type='application/json' - ) + response = client.get("api/v1/biogrid/by_biogrid_id/2658248", content_type="application/json") output = format_response_data(response) expected_results = { - "biogrid_a": { - "entrez": 59272, - "symbol": "ACE2", - "systematic_name": "UNQ868/PRO1885", - "taxonomy": { - "organism_name": "Homo sapiens", - "taxonomy_id": 9606 + "biogrid_a": { + "entrez": 59272, + "symbol": "ACE2", + "systematic_name": "UNQ868/PRO1885", + "taxonomy": {"organism_name": "Homo sapiens", "taxonomy_id": 9606}, + "trembl": None, + "uniprot": "Q9BYF1", }, - "trembl": None, - "uniprot": "Q9BYF1" - }, - "biogrid_b": { - "entrez": 43740568, - "symbol": "S", - "systematic_name": "GU280_gp02", - "taxonomy": { - "organism_name": "Severe acute respiratory syndrome coronavirus 2", - "taxonomy_id": 2697049 + "biogrid_b": { + "entrez": 43740568, + "symbol": "S", + "systematic_name": "GU280_gp02", + "taxonomy": { + "organism_name": "Severe acute respiratory syndrome coronavirus 2", + "taxonomy_id": 2697049, + }, + "trembl": None, + "uniprot": "P0DTC2", + }, + "biogrid_id": 2658248, + "experimental_system": { + "experimental_system": "Reconstituted Complex", + "experimental_system_type": "physical", + "frequency": 58466, }, - "trembl": None, - "uniprot": "P0DTC2" - }, - "biogrid_id": 2658248, - "experimental_system": { - "experimental_system": "Reconstituted Complex", - "experimental_system_type": "physical", - "frequency": 58466 - }, - "modification": None, - "publication": { - "author_name": "Sun C ", - "publication_year": 2020, - "source": "DOI", - "source_identifier": "10.1101/2020.02.16.951723" - }, - "qualification": "The Receptor Binding Domain (RBD) of the SARS-CoV-2 Spike protein interacts with human 
ACE2 in vitro.", - "score": None, - "source": "BIOGRID", - "throughput": { - "frequency": 366952, - "throughput": "Low Throughput" - } + "modification": None, + "publication": { + "author_name": "Sun C ", + "publication_year": 2020, + "source": "DOI", + "source_identifier": "10.1101/2020.02.16.951723", + }, + "qualification": "The Receptor Binding Domain (RBD) of the SARS-CoV-2 Spike protein interacts with human ACE2 in vitro.", + "score": None, + "source": "BIOGRID", + "throughput": {"frequency": 366952, "throughput": "Low Throughput"}, } assert isinstance(output, dict) assert output == expected_results def test_get_biogrid_by_biogrid_id_using_post(self, client): response = client.post( - 'api/v1/biogrid/by_biogrid_id', + "api/v1/biogrid/by_biogrid_id", json={"biogrid_id": 2658248}, ) output = format_response_data(response) expected_results = { - "biogrid_a": { - "entrez": 59272, - "symbol": "ACE2", - "systematic_name": "UNQ868/PRO1885", - "taxonomy": { - "organism_name": "Homo sapiens", - "taxonomy_id": 9606 + "biogrid_a": { + "entrez": 59272, + "symbol": "ACE2", + "systematic_name": "UNQ868/PRO1885", + "taxonomy": {"organism_name": "Homo sapiens", "taxonomy_id": 9606}, + "trembl": None, + "uniprot": "Q9BYF1", + }, + "biogrid_b": { + "entrez": 43740568, + "symbol": "S", + "systematic_name": "GU280_gp02", + "taxonomy": { + "organism_name": "Severe acute respiratory syndrome coronavirus 2", + "taxonomy_id": 2697049, + }, + "trembl": None, + "uniprot": "P0DTC2", }, - "trembl": None, - "uniprot": "Q9BYF1" - }, - "biogrid_b": { - "entrez": 43740568, - "symbol": "S", - "systematic_name": "GU280_gp02", - "taxonomy": { - "organism_name": "Severe acute respiratory syndrome coronavirus 2", - "taxonomy_id": 2697049 + "biogrid_id": 2658248, + "experimental_system": { + "experimental_system": "Reconstituted Complex", + "experimental_system_type": "physical", + "frequency": 58466, + }, + "modification": None, + "publication": { + "author_name": "Sun C ", + "publication_year": 
2020, + "source": "DOI", + "source_identifier": "10.1101/2020.02.16.951723", }, - "trembl": None, - "uniprot": "P0DTC2" - }, - "biogrid_id": 2658248, - "experimental_system": { - "experimental_system": "Reconstituted Complex", - "experimental_system_type": "physical", - "frequency": 58466 - }, - "modification": None, - "publication": { - "author_name": "Sun C ", - "publication_year": 2020, - "source": "DOI", - "source_identifier": "10.1101/2020.02.16.951723" - }, - "qualification": "The Receptor Binding Domain (RBD) of the SARS-CoV-2 Spike protein interacts with human ACE2 in vitro.", - "score": None, - "source": "BIOGRID", - "throughput": { - "frequency": 366952, - "throughput": "Low Throughput" - } + "qualification": "The Receptor Binding Domain (RBD) of the SARS-CoV-2 Spike protein interacts with human ACE2 in vitro.", + "score": None, + "source": "BIOGRID", + "throughput": {"frequency": 366952, "throughput": "Low Throughput"}, } assert isinstance(output, dict) assert output == expected_results def test_get_experimental_systems(self, client): - response = client.get( - 'api/v1/biogrid/experimental_systems', - content_type='application/json' - ) + response = client.get("api/v1/biogrid/experimental_systems", content_type="application/json") output = format_response_data(response) expected_results = { - "Affinity Capture-Luminescence": 24, - "Affinity Capture-MS": 2, - "Affinity Capture-RNA": 9, - "Affinity Capture-Western": 5, - "Biochemical Activity": 12, - "Co-crystal Structure": 21, - "Co-fractionation": 6, - "Co-localization": 20, - "Co-purification": 19, - "Dosage Growth Defect": 26, - "Dosage Lethality": 25, - "Dosage Rescue": 18, - "FRET": 22, - "Far Western": 27, - "Negative Genetic": 1, - "PCA": 11, - "Phenotypic Enhancement": 15, - "Phenotypic Suppression": 14, - "Positive Genetic": 4, - "Protein-RNA": 17, - "Protein-peptide": 23, - "Proximity Label-MS": 7, - "Reconstituted Complex": 8, - "Synthetic Growth Defect": 10, - "Synthetic Haploinsufficiency": 
28, - "Synthetic Lethality": 13, - "Synthetic Rescue": 16, - "Two-hybrid": 3 + "Affinity Capture-Luminescence": 24, + "Affinity Capture-MS": 2, + "Affinity Capture-RNA": 9, + "Affinity Capture-Western": 5, + "Biochemical Activity": 12, + "Co-crystal Structure": 21, + "Co-fractionation": 6, + "Co-localization": 20, + "Co-purification": 19, + "Dosage Growth Defect": 26, + "Dosage Lethality": 25, + "Dosage Rescue": 18, + "FRET": 22, + "Far Western": 27, + "Negative Genetic": 1, + "PCA": 11, + "Phenotypic Enhancement": 15, + "Phenotypic Suppression": 14, + "Positive Genetic": 4, + "Protein-RNA": 17, + "Protein-peptide": 23, + "Proximity Label-MS": 7, + "Reconstituted Complex": 8, + "Synthetic Growth Defect": 10, + "Synthetic Haploinsufficiency": 28, + "Synthetic Lethality": 13, + "Synthetic Rescue": 16, + "Two-hybrid": 3, } assert isinstance(output, dict) assert output == expected_results def test_get_taxonomies(self, client): - response = client.get( - 'api/v1/biogrid/taxonomies', - content_type='application/json' - ) + response = client.get("api/v1/biogrid/taxonomies", content_type="application/json") output = format_response_data(response) expected_results = { - "Anopheles gambiae (PEST)": 180454, - "Apis mellifera": 7460, - "Arabidopsis thaliana (Columbia)": 3702, - "Bacillus subtilis (168)": 224308, - "Bos taurus": 9913, - "Caenorhabditis elegans": 6239, - "Candida albicans (SC5314)": 237561, - "Canis familiaris": 9615, - "Cavia porcellus": 10141, - "Chlamydomonas reinhardtii": 3055, - "Chlorocebus sabaeus": 60711, - "Cricetulus griseus": 10029, - "Danio rerio": 7955, - "Dictyostelium discoideum (AX4)": 352472, - "Drosophila melanogaster": 7227, - "Emericella nidulans (FGSC A4)": 227321, - "Equus caballus": 9796, - "Escherichia coli (K12)": 83333, - "Escherichia coli (K12/MC4100/BW2952)": 595496, - "Escherichia coli (K12/MG1655)": 511145, - "Escherichia coli (K12/W3110)": 316407, - "Felis Catus": 9685, - "Gallus gallus": 9031, - "Glycine max": 3847, - "Hepatitus C 
Virus": 11103, - "Homo sapiens": 9606, - "Human Herpesvirus 1": 10298, - "Human Herpesvirus 2": 10310, - "Human Herpesvirus 3": 10335, - "Human Herpesvirus 4": 10376, - "Human Herpesvirus 5": 10359, - "Human Herpesvirus 6A": 32603, - "Human Herpesvirus 6B": 32604, - "Human Herpesvirus 7": 10372, - "Human Herpesvirus 8": 37296, - "Human Immunodeficiency Virus 1": 11676, - "Human Immunodeficiency Virus 2": 11709, - "Human papillomavirus (10)": 333759, - "Human papillomavirus (16)": 333760, - "Human papillomavirus (32)": 333763, - "Human papillomavirus (5)": 333923, - "Human papillomavirus (6b)": 10600, - "Human papillomavirus (7)": 10620, - "Human papillomavirus (9)": 10621, - "Leishmania major (Friedlin)": 347515, - "Macaca mulatta": 9544, - "Meleagris gallopavo": 9103, - "Middle-East Respiratory Syndrome-related Coronavirus": 1335626, - "Monodelphis domestica": 13616, - "Mus musculus": 10090, - "Mycobacterium tuberculosis (H37Rv)": 83332, - "Neurospora crassa (OR74A)": 367110, - "Nicotiana tomentosiformis": 4098, - "Oryctolagus cuniculus": 9986, - "Oryza sativa (Japonica)": 39947, - "Ovis aries": 9940, - "Pan troglodytes": 9598, - "Pediculus humanus": 121224, - "Plasmodium falciparum (3D7)": 36329, - "Rattus norvegicus": 10116, - "Ricinus communis": 3988, - "Saccharomyces cerevisiae (S288c)": 559292, - "Schizosaccharomyces pombe (972h)": 284812, - "Selaginella moellendorffii": 88036, - "Severe acute respiratory syndrome coronavirus 2": 2697049, - "Severe acute respiratory syndrome-related coronavirus": 694009, - "Simian Immunodeficiency Virus": 11723, - "Simian Virus 40": 10633, - "Solanum lycopersicum": 4081, - "Solanum tuberosum": 4113, - "Sorghum bicolor": 4558, - "Streptococcus pneumoniae (ATCCBAA255)": 171101, - "Strongylocentrotus purpuratus": 7668, - "Sus scrofa": 9823, - "Tobacco Mosaic Virus": 12242, - "Ustilago maydis (521)": 237631, - "Vaccinia Virus": 10245, - "Vitis vinifera": 29760, - "Xenopus laevis": 8355, - "Zea mays": 4577 + "Anopheles gambiae 
(PEST)": 180454, + "Apis mellifera": 7460, + "Arabidopsis thaliana (Columbia)": 3702, + "Bacillus subtilis (168)": 224308, + "Bos taurus": 9913, + "Caenorhabditis elegans": 6239, + "Candida albicans (SC5314)": 237561, + "Canis familiaris": 9615, + "Cavia porcellus": 10141, + "Chlamydomonas reinhardtii": 3055, + "Chlorocebus sabaeus": 60711, + "Cricetulus griseus": 10029, + "Danio rerio": 7955, + "Dictyostelium discoideum (AX4)": 352472, + "Drosophila melanogaster": 7227, + "Emericella nidulans (FGSC A4)": 227321, + "Equus caballus": 9796, + "Escherichia coli (K12)": 83333, + "Escherichia coli (K12/MC4100/BW2952)": 595496, + "Escherichia coli (K12/MG1655)": 511145, + "Escherichia coli (K12/W3110)": 316407, + "Felis Catus": 9685, + "Gallus gallus": 9031, + "Glycine max": 3847, + "Hepatitus C Virus": 11103, + "Homo sapiens": 9606, + "Human Herpesvirus 1": 10298, + "Human Herpesvirus 2": 10310, + "Human Herpesvirus 3": 10335, + "Human Herpesvirus 4": 10376, + "Human Herpesvirus 5": 10359, + "Human Herpesvirus 6A": 32603, + "Human Herpesvirus 6B": 32604, + "Human Herpesvirus 7": 10372, + "Human Herpesvirus 8": 37296, + "Human Immunodeficiency Virus 1": 11676, + "Human Immunodeficiency Virus 2": 11709, + "Human papillomavirus (10)": 333759, + "Human papillomavirus (16)": 333760, + "Human papillomavirus (32)": 333763, + "Human papillomavirus (5)": 333923, + "Human papillomavirus (6b)": 10600, + "Human papillomavirus (7)": 10620, + "Human papillomavirus (9)": 10621, + "Leishmania major (Friedlin)": 347515, + "Macaca mulatta": 9544, + "Meleagris gallopavo": 9103, + "Middle-East Respiratory Syndrome-related Coronavirus": 1335626, + "Monodelphis domestica": 13616, + "Mus musculus": 10090, + "Mycobacterium tuberculosis (H37Rv)": 83332, + "Neurospora crassa (OR74A)": 367110, + "Nicotiana tomentosiformis": 4098, + "Oryctolagus cuniculus": 9986, + "Oryza sativa (Japonica)": 39947, + "Ovis aries": 9940, + "Pan troglodytes": 9598, + "Pediculus humanus": 121224, + "Plasmodium 
falciparum (3D7)": 36329, + "Rattus norvegicus": 10116, + "Ricinus communis": 3988, + "Saccharomyces cerevisiae (S288c)": 559292, + "Schizosaccharomyces pombe (972h)": 284812, + "Selaginella moellendorffii": 88036, + "Severe acute respiratory syndrome coronavirus 2": 2697049, + "Severe acute respiratory syndrome-related coronavirus": 694009, + "Simian Immunodeficiency Virus": 11723, + "Simian Virus 40": 10633, + "Solanum lycopersicum": 4081, + "Solanum tuberosum": 4113, + "Sorghum bicolor": 4558, + "Streptococcus pneumoniae (ATCCBAA255)": 171101, + "Strongylocentrotus purpuratus": 7668, + "Sus scrofa": 9823, + "Tobacco Mosaic Virus": 12242, + "Ustilago maydis (521)": 237631, + "Vaccinia Virus": 10245, + "Vitis vinifera": 29760, + "Xenopus laevis": 8355, + "Zea mays": 4577, } assert isinstance(output, dict) assert output == expected_results def test_get_modifications(self, client): - response = client.get( - 'api/v1/biogrid/modifications', - content_type='application/json' - ) + response = client.get("api/v1/biogrid/modifications", content_type="application/json") output = format_response_data(response) expected_results = { - "Acetylation": 4, - "Deacetylation": 10, - "Demethylation": 14, - "Deneddylation": 15, - "Dephosphorylation": 9, - "Desumoylation": 13, - "Deubiquitination": 5, - "FAT10ylation": 19, - "Glycosylation": 17, - "Methylation": 7, - "Nedd(Rub1)ylation": 11, - "Neddylation": 18, - "No Modification": 3, - "Phosphorylation": 1, - "Prenylation": 16, - "Proteolytic Processing": 6, - "Ribosylation": 12, - "Sumoylation": 8, - "Ubiquitination": 2, - "de-ISGylation": 20 + "Acetylation": 4, + "Deacetylation": 10, + "Demethylation": 14, + "Deneddylation": 15, + "Dephosphorylation": 9, + "Desumoylation": 13, + "Deubiquitination": 5, + "FAT10ylation": 19, + "Glycosylation": 17, + "Methylation": 7, + "Nedd(Rub1)ylation": 11, + "Neddylation": 18, + "No Modification": 3, + "Phosphorylation": 1, + "Prenylation": 16, + "Proteolytic Processing": 6, + "Ribosylation": 
12, + "Sumoylation": 8, + "Ubiquitination": 2, + "de-ISGylation": 20, } assert isinstance(output, dict) assert output == expected_results def test_get_biogrid_by_pmid(self, client): - response = client.get( - 'api/v1/biogrid/by_pmid/21685874', - content_type='application/json' - ) + response = client.get("api/v1/biogrid/by_pmid/21685874", content_type="application/json") output = format_response_data(response) expected_results = { "biogrid_a": { - "entrez": 853167, - "symbol": "GCN5", - "systematic_name": "YGR252W", - "taxonomy": { - "organism_name": "Saccharomyces cerevisiae (S288c)", - "taxonomy_id": 559292 - }, - "trembl": None, - "uniprot": "Q03330" + "entrez": 853167, + "symbol": "GCN5", + "systematic_name": "YGR252W", + "taxonomy": {"organism_name": "Saccharomyces cerevisiae (S288c)", "taxonomy_id": 559292}, + "trembl": None, + "uniprot": "Q03330", }, "biogrid_b": { - "entrez": 852295, - "symbol": "HHT1", - "systematic_name": "YBR010W", - "taxonomy": { - "organism_name": "Saccharomyces cerevisiae (S288c)", - "taxonomy_id": 559292 - }, - "trembl": None, - "uniprot": "P61830" + "entrez": 852295, + "symbol": "HHT1", + "systematic_name": "YBR010W", + "taxonomy": {"organism_name": "Saccharomyces cerevisiae (S288c)", "taxonomy_id": 559292}, + "trembl": None, + "uniprot": "P61830", }, "biogrid_id": 543927, "experimental_system": { - "experimental_system": "Biochemical Activity", - "experimental_system_type": "physical", - "frequency": 24247 - }, - "modification": { - "frequency": 846, - "modification": "Acetylation" + "experimental_system": "Biochemical Activity", + "experimental_system_type": "physical", + "frequency": 24247, }, + "modification": {"frequency": 846, "modification": "Acetylation"}, "publication": { - "author_name": "Bian C ", - "publication_year": 2011, - "source": "PUBMED", - "source_identifier": "21685874" + "author_name": "Bian C ", + "publication_year": 2011, + "source": "PUBMED", + "source_identifier": "21685874", }, "qualification": "the 
purified SAGA complex can acetylate histone 3 in vitro", "score": None, "source": "BIOGRID", - "throughput": { - "frequency": 1611845, - "throughput": "High Throughput" - } - } + "throughput": {"frequency": 1611845, "throughput": "High Throughput"}, + } assert isinstance(output, list) hit = output[0] assert isinstance(hit, dict) @@ -394,56 +342,44 @@ def test_get_biogrid_by_pmid(self, client): def test_get_biogrid(self, client): response = client.get( - 'api/v1/biogrid?id_type_a=symbol&interactor_a=MAPT&taxonomy_id_a=9606&id_type_b=symbol&taxonomy_id_b=9606&modification=Acetylation&page_size=10&page=1', - content_type='application/json' + "api/v1/biogrid?id_type_a=symbol&interactor_a=MAPT&taxonomy_id_a=9606&id_type_b=symbol&taxonomy_id_b=9606&modification=Acetylation&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "biogrid_a": { - "entrez": 2033, - "symbol": "EP300", - "systematic_name": "RP1-85F18.1", - "taxonomy": { - "organism_name": "Homo sapiens", - "taxonomy_id": 9606 + "biogrid_a": { + "entrez": 2033, + "symbol": "EP300", + "systematic_name": "RP1-85F18.1", + "taxonomy": {"organism_name": "Homo sapiens", "taxonomy_id": 9606}, + "trembl": "Q7Z6C1", + "uniprot": "Q09472", }, - "trembl": "Q7Z6C1", - "uniprot": "Q09472" - }, - "biogrid_b": { - "entrez": 4137, - "symbol": "MAPT", - "systematic_name": None, - "taxonomy": { - "organism_name": "Homo sapiens", - "taxonomy_id": 9606 + "biogrid_b": { + "entrez": 4137, + "symbol": "MAPT", + "systematic_name": None, + "taxonomy": {"organism_name": "Homo sapiens", "taxonomy_id": 9606}, + "trembl": "B3KTM0", + "uniprot": "P10636", + }, + "biogrid_id": 558818, + "experimental_system": { + "experimental_system": "Biochemical Activity", + "experimental_system_type": "physical", + "frequency": 24247, + }, + "modification": {"frequency": 846, "modification": "Acetylation"}, + "publication": { + "author_name": "Min SW ", + "publication_year": 2010, + 
"source": "PUBMED", + "source_identifier": "20869593", }, - "trembl": "B3KTM0", - "uniprot": "P10636" - }, - "biogrid_id": 558818, - "experimental_system": { - "experimental_system": "Biochemical Activity", - "experimental_system_type": "physical", - "frequency": 24247 - }, - "modification": { - "frequency": 846, - "modification": "Acetylation" - }, - "publication": { - "author_name": "Min SW ", - "publication_year": 2010, - "source": "PUBMED", - "source_identifier": "20869593" - }, - "qualification": None, - "score": None, - "source": "BIOGRID", - "throughput": { - "frequency": 366952, - "throughput": "Low Throughput" - } + "qualification": None, + "score": None, + "source": "BIOGRID", + "throughput": {"frequency": 366952, "throughput": "Low Throughput"}, } assert output[NUM_RESULTS] == 4 assert output[PAGE_SIZE] == 10 @@ -455,59 +391,49 @@ def test_get_biogrid(self, client): def test_get_biogrid_using_post(self, client): response = client.post( - 'api/v1/biogrid', - json={ - "experimental_system_id": 2, - "page": 2, - "page_size": 3 - }, + "api/v1/biogrid", + json={"experimental_system_id": 2, "page": 2, "page_size": 3}, follow_redirects=True, ) output = format_response_data(response) expected_results = { - "biogrid_a": { - "entrez": 1489671, - "symbol": "E", - "systematic_name": "SARS-CoV-4", - "taxonomy": { - "organism_name": "Severe acute respiratory syndrome-related coronavirus", - "taxonomy_id": 694009 + "biogrid_a": { + "entrez": 1489671, + "symbol": "E", + "systematic_name": "SARS-CoV-4", + "taxonomy": { + "organism_name": "Severe acute respiratory syndrome-related coronavirus", + "taxonomy_id": 694009, + }, + "trembl": None, + "uniprot": "P59637", }, - "trembl": None, - "uniprot": "P59637" - }, - "biogrid_b": { - "entrez": 83871, - "symbol": "RAB34", - "systematic_name": None, - "taxonomy": { - "organism_name": "Homo sapiens", - "taxonomy_id": 9606 + "biogrid_b": { + "entrez": 83871, + "symbol": "RAB34", + "systematic_name": None, + "taxonomy": 
{"organism_name": "Homo sapiens", "taxonomy_id": 9606}, + "trembl": "B4DNC0", + "uniprot": "Q9BZG1", }, - "trembl": "B4DNC0", - "uniprot": "Q9BZG1" - }, - "biogrid_id": 2754699, - "experimental_system": { - "experimental_system": "Affinity Capture-MS", - "experimental_system_type": "physical", - "frequency": 459921 - }, - "modification": None, - "publication": { - "author_name": "Stukalov A ", - "publication_year": 2020, - "source": "DOI", - "source_identifier": "10.1101/2020.06.17.156455" - }, - "qualification": "Affinity capture-MS was carried out using HA-tagged viral proteins as baits in human A549 lung carcinoma cells. Significant interactions were identified as those where the prey protein was at least 4 times enriched against the background (median log2 score > 2) with a p-value <= 1E-3.", - "score": 8.15622, - "source": "BIOGRID", - "throughput": { - "frequency": 1611845, - "throughput": "High Throughput" - } + "biogrid_id": 2754699, + "experimental_system": { + "experimental_system": "Affinity Capture-MS", + "experimental_system_type": "physical", + "frequency": 459921, + }, + "modification": None, + "publication": { + "author_name": "Stukalov A ", + "publication_year": 2020, + "source": "DOI", + "source_identifier": "10.1101/2020.06.17.156455", + }, + "qualification": "Affinity capture-MS was carried out using HA-tagged viral proteins as baits in human A549 lung carcinoma cells. 
Significant interactions were identified as those where the prey protein was at least 4 times enriched against the background (median log2 score > 2) with a p-value <= 1E-3.", + "score": 8.15622, + "source": "BIOGRID", + "throughput": {"frequency": 1611845, "throughput": "High Throughput"}, } assert output[NUM_RESULTS] > 459000 assert output[PAGE_SIZE] == 3 @@ -519,20 +445,17 @@ def test_get_biogrid_using_post(self, client): def test_get_interactor_by_symbol_starts_with(self, client): response = client.get( - 'api/v1/biogrid/interactor/by_symbol_starts_with?symbol=trem&taxonomy_id=9606&page_size=10&page=1', - content_type='application/json' + "api/v1/biogrid/interactor/by_symbol_starts_with?symbol=trem&taxonomy_id=9606&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "entrez": 54210, - "symbol": "TREM1", - "systematic_name": None, - "taxonomy": { - "organism_name": "Homo sapiens", - "taxonomy_id": 9606 - }, - "trembl": "Q38L15", - "uniprot": "Q9NP99" + "entrez": 54210, + "symbol": "TREM1", + "systematic_name": None, + "taxonomy": {"organism_name": "Homo sapiens", "taxonomy_id": 9606}, + "trembl": "Q38L15", + "uniprot": "Q9NP99", } assert output[NUM_RESULTS] == 4 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_chebi.py b/tests/test_api/test_ebel/test_chebi.py index 8d13ce0..573fff0 100644 --- a/tests/test_api/test_ebel/test_chebi.py +++ b/tests/test_api/test_ebel/test_chebi.py @@ -1,15 +1,12 @@ """CHEBI API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS +from .constants import NUM_RESULTS, RESULTS class TestChebi: def test_get_compound_by_name(self, client): - response = client.get( - 'api/v1/chebi/compound/by_name?name=donepezil', - content_type='application/json' - ) + response = client.get("api/v1/chebi/compound/by_name?name=donepezil", content_type="application/json") output = format_response_data(response) 
expected_results = { # only check some of the generated information "source": "ChEBI", @@ -26,23 +23,15 @@ def test_get_compound_by_name(self, client): def test_get_compound_name_by_name_starts_with(self, client): response = client.get( - 'api/v1/chebi/compound_name/by_name_starts_with?name=donep', - content_type='application/json' + "api/v1/chebi/compound_name/by_name_starts_with?name=donep", content_type="application/json" ) output = format_response_data(response) - expected_results = { - "donepezil": 53289, - "donepezil (1+)": 145498, - "donepezil hydrochloride": 4696 - } + expected_results = {"donepezil": 53289, "donepezil (1+)": 145498, "donepezil hydrochloride": 4696} assert isinstance(output, dict) assert expected_results == output def test_get_compound_by_id(self, client): - response = client.get( - 'api/v1/chebi/compound/by_id?id=53289', - content_type='application/json' - ) + response = client.get("api/v1/chebi/compound/by_id?id=53289", content_type="application/json") output = format_response_data(response) expected_results = { # only check some of the generated information "source": "ChEBI", @@ -57,8 +46,8 @@ def test_get_compound_by_id(self, client): def test_get_compound_by_other_db_accession(self, client): response = client.get( - 'api/v1/chebi/compound/by_other_db_accession?db_name=DrugBank%20accession&accession_number=DB00843', - content_type='application/json' + "api/v1/chebi/compound/by_other_db_accession?db_name=DrugBank%20accession&accession_number=DB00843", + content_type="application/json", ) output = format_response_data(response) expected_results = { # only check some of the generated information @@ -75,16 +64,16 @@ def test_get_compound_by_other_db_accession(self, client): def test_get_compound_reference(self, client): response = client.get( - 
'api/v1/chebi/compound_reference?reference_id=US2006025345&reference_db_name=Patent&reference_name=Substituted%20ethane-1%2C2-diamines%20for%20the%20treatment%20of%20Alzheimer%27s%20disease&page_size=10&page=1', - content_type='application/json' + "api/v1/chebi/compound_reference?reference_id=US2006025345&reference_db_name=Patent&reference_name=Substituted%20ethane-1%2C2-diamines%20for%20the%20treatment%20of%20Alzheimer%27s%20disease&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "compound_id": 17596, - "location_in_ref": None, - "reference_db_name": "Patent", - "reference_id": "US2006025345", - "reference_name": "Substituted ethane-1,2-diamines for the treatment of Alzheimer's disease" + "compound_id": 17596, + "location_in_ref": None, + "reference_db_name": "Patent", + "reference_id": "US2006025345", + "reference_name": "Substituted ethane-1,2-diamines for the treatment of Alzheimer's disease", } assert output[NUM_RESULTS] > 20 results = output[RESULTS] @@ -94,10 +83,7 @@ def test_get_compound_reference(self, client): assert hit == expected_results def test_get_relation(self, client): - response = client.get( - 'api/v1/chebi/relation?final_id=53289', - content_type='application/json' - ) + response = client.get("api/v1/chebi/relation?final_id=53289", content_type="application/json") output = format_response_data(response) assert isinstance(output, list) assert len(output) > 0 @@ -107,8 +93,8 @@ def test_get_relation(self, client): def test_get_bel_chebi_ids(self, client): response = client.get( - 'api/v1/chebi/ebel/nodes?name=ATP&namespace=CHEBI&chebi=15422&page_size=10&page=1', - content_type='application/json' + "api/v1/chebi/ebel/nodes?name=ATP&namespace=CHEBI&chebi=15422&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) assert isinstance(output, dict) @@ -116,6 +102,6 @@ def test_get_bel_chebi_ids(self, client): assert 
isinstance(results, list) assert len(output) >= 5 hit = results[0] - assert all([col in hit for col in ('bel', 'chebi', 'name', 'namespace', 'rid')]) - assert hit['name'] == "ATP" + assert all([col in hit for col in ("bel", "chebi", "name", "namespace", "rid")]) + assert hit["name"] == "ATP" assert hit["chebi"] == 15422 diff --git a/tests/test_api/test_ebel/test_clinical_trials_gov.py b/tests/test_api/test_ebel/test_clinical_trials_gov.py index 46bda46..fadd6de 100644 --- a/tests/test_api/test_ebel/test_clinical_trials_gov.py +++ b/tests/test_api/test_ebel/test_clinical_trials_gov.py @@ -1,49 +1,37 @@ """ClinicalTrials API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS +from .constants import NUM_RESULTS, RESULTS class TestClinicalTrials: def test_get_ct_by_nct_id(self, client): - response = client.get( - 'api/v1/clinical_trial/by_nct_id?nct_id=NCT00571064', - content_type='application/json' - ) + response = client.get("api/v1/clinical_trial/by_nct_id?nct_id=NCT00571064", content_type="application/json") output = format_response_data(response) expected_results = { - "brief_summary": "This is a study to determine the effectiveness and safety of donepezil hydrochloride (E2020) used to treat residents of assisted living facilities diagnosed with mild, moderate, or severe stage Alzheimer's disease.", - "brief_title": "The Effectiveness And Safety Of Donepezil Hydrochloride (E2020) In Subjects With Mild To Severe Alzheimer's Disease Residing In An Assisted Living Facility", - "completion_date": "April 22, 2009", - "conditions": [ - "Mild to Severe Alzheimer's Disease" - ], - "detailed_description": None, - "id": 47330, - "interventions": [ - { - "intervention_name": "Donepezil HCl", - "intervention_type": "Drug" - } - ], - "is_fda_regulated_drug": None, - "keywords": [], - "mesh_terms": [ - "Alzheimer Disease" - ], - "nct_id": "NCT00571064", - "official_title": "A 12-Week, Multicenter, Open Label Study To Evaluate 
The Effectiveness And Safety Of Donepezil Hydrochloride (E2020) In Subjects With Mild To Severe Alzheimer's Disease Residing In An Assisted Living Facility", - "org_study_id": "E2020-A001-415", - "overall_status": "Completed", - "patient_data_ipd_description": None, - "patient_data_sharing_ipd": None, - "phase": "Phase 4", - "start_date": "January 2008", - "study_design_intervention_model": "Single Group Assignment", - "study_design_masking": "None (Open Label)", - "study_design_primary_purpose": "Treatment", - "study_type": "Interventional" + "brief_summary": "This is a study to determine the effectiveness and safety of donepezil hydrochloride (E2020) used to treat residents of assisted living facilities diagnosed with mild, moderate, or severe stage Alzheimer's disease.", + "brief_title": "The Effectiveness And Safety Of Donepezil Hydrochloride (E2020) In Subjects With Mild To Severe Alzheimer's Disease Residing In An Assisted Living Facility", + "completion_date": "April 22, 2009", + "conditions": ["Mild to Severe Alzheimer's Disease"], + "detailed_description": None, + "id": 47330, + "interventions": [{"intervention_name": "Donepezil HCl", "intervention_type": "Drug"}], + "is_fda_regulated_drug": None, + "keywords": [], + "mesh_terms": ["Alzheimer Disease"], + "nct_id": "NCT00571064", + "official_title": "A 12-Week, Multicenter, Open Label Study To Evaluate The Effectiveness And Safety Of Donepezil Hydrochloride (E2020) In Subjects With Mild To Severe Alzheimer's Disease Residing In An Assisted Living Facility", + "org_study_id": "E2020-A001-415", + "overall_status": "Completed", + "patient_data_ipd_description": None, + "patient_data_sharing_ipd": None, + "phase": "Phase 4", + "start_date": "January 2008", + "study_design_intervention_model": "Single Group Assignment", + "study_design_masking": "None (Open Label)", + "study_design_primary_purpose": "Treatment", + "study_type": "Interventional", } assert isinstance(output, dict) @@ -51,8 +39,8 @@ def 
test_get_ct_by_nct_id(self, client): def test_get_ct_by_mesh_term(self, client): response = client.get( - 'api/v1/clinical_trial/by_mesh_term?mesh_term=Alzheimer%20Disease&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/by_mesh_term?mesh_term=Alzheimer%20Disease&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) @@ -68,8 +56,8 @@ def test_get_ct_by_mesh_term(self, client): def test_get_ct_by_intervention(self, client): response = client.get( - 'api/v1/clinical_trial/by_intervention?intervention_name=Donepezil&intervention_type=Drug&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/by_intervention?intervention_name=Donepezil&intervention_type=Drug&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) @@ -87,17 +75,12 @@ def test_get_ct_by_intervention(self, client): def test_get_ct_by_keyword(self, client): response = client.get( - 'api/v1/clinical_trial/by_keyword?keyword=Early%20Alzheimer&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/by_keyword?keyword=Early%20Alzheimer&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_results = { - "keyword": "Early Alzheimer", - "nct_ids": [ - "NCT01439555" - ] - } + expected_results = {"keyword": "Early Alzheimer", "nct_ids": ["NCT01439555"]} assert output[NUM_RESULTS] == 1 results = output[RESULTS] @@ -108,17 +91,12 @@ def test_get_ct_by_keyword(self, client): def test_get_ct_by_condition(self, client): response = client.get( - 'api/v1/clinical_trial/by_condition?condition=Mild%20to%20Severe%20Alzheimer%27s%20Disease&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/by_condition?condition=Mild%20to%20Severe%20Alzheimer%27s%20Disease&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_results 
= { - "condition": "Mild to Severe Alzheimer's Disease", - "nct_ids": [ - "NCT00571064" - ] - } + expected_results = {"condition": "Mild to Severe Alzheimer's Disease", "nct_ids": ["NCT00571064"]} assert output[NUM_RESULTS] == 1 results = output[RESULTS] @@ -129,8 +107,8 @@ def test_get_ct_by_condition(self, client): def test_get_mesh_term_starts_with(self, client): response = client.get( - 'api/v1/clinical_trial/mesh_term/starts_with?mesh_term=alz&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/mesh_term/starts_with?mesh_term=alz&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) @@ -143,8 +121,8 @@ def test_get_mesh_term_starts_with(self, client): def test_get_keyword_starts_with(self, client): response = client.get( - 'api/v1/clinical_trial/keyword/starts_with?keyword=alz&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/keyword/starts_with?keyword=alz&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) @@ -157,8 +135,8 @@ def test_get_keyword_starts_with(self, client): def test_get_condition_starts_with(self, client): response = client.get( - 'api/v1/clinical_trial/condition/starts_with?condition=alz&page_size=10&page=1', - content_type='application/json' + "api/v1/clinical_trial/condition/starts_with?condition=alz&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) diff --git a/tests/test_api/test_ebel/test_clinvar.py b/tests/test_api/test_ebel/test_clinvar.py index 09f63b6..95db926 100644 --- a/tests/test_api/test_ebel/test_clinvar.py +++ b/tests/test_api/test_ebel/test_clinvar.py @@ -1,56 +1,53 @@ """ClinVAR API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestClinvar: - def test_get_clinvar(self, client): response = 
client.get( - 'api/v1/clinvar?allele_id=705202&type=single%20nucleotide%20variant&name=NM_001772.4%28CD33%29%3Ac.131T%3EC%20%28p.Phe44Ser%29&gene_id=945&gene_symbol=CD33&hgnc_id=HGNC%3A1659&clinical_significance=Benign&last_evaluated=Jul%2016%2C%202018&rs_db_snp=61736469&rcvaccession=RCV000957991&origin=germline&origin_simple=germline&assembly=GRCh37&chromosome_accession=NC_000019.9&chromosome=19&start=51728567&stop=51728567&reference_allele=na&alternate_allele=na&cytogenetic=19q13.41&review_status=criteria%20provided%2C%20single%20submitter&number_submitters=1&tested_in_gtr=N&submitter_categories=2&variation_id=777507&position_vcf=51728567&reference_allele_vcf=T&alternate_allele_vcf=C&page_size=10&page=1', - content_type='application/json' + "api/v1/clinvar?allele_id=705202&type=single%20nucleotide%20variant&name=NM_001772.4%28CD33%29%3Ac.131T%3EC%20%28p.Phe44Ser%29&gene_id=945&gene_symbol=CD33&hgnc_id=HGNC%3A1659&clinical_significance=Benign&last_evaluated=Jul%2016%2C%202018&rs_db_snp=61736469&rcvaccession=RCV000957991&origin=germline&origin_simple=germline&assembly=GRCh37&chromosome_accession=NC_000019.9&chromosome=19&start=51728567&stop=51728567&reference_allele=na&alternate_allele=na&cytogenetic=19q13.41&review_status=criteria%20provided%2C%20single%20submitter&number_submitters=1&tested_in_gtr=N&submitter_categories=2&variation_id=777507&position_vcf=51728567&reference_allele_vcf=T&alternate_allele_vcf=C&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "allele_id": 705202, - "alternate_allele": "na", - "alternate_allele_vcf": "C", - "assembly": "GRCh37", - "chromosome": "19", - "chromosome_accession": "NC_000019.9", - "clin_sig_simple": 0, - "clinical_significance": "Benign", - "cytogenetic": "19q13.41", - "gene_id": 945, - "gene_symbol": "CD33", - "guidelines": None, - "hgnc_id": "HGNC:1659", - "id": 1132137, - "last_evaluated": "Jul 16, 2018", - "name": "NM_001772.4(CD33):c.131T>C 
(p.Phe44Ser)", - "nsv_esv_db_var": None, - "number_submitters": 1, - "origin": "germline", - "origin_simple": "germline", - "otherIdentifiers": [], - "phenotypeMedgens": [ - "CN517202" - ], - "position_vcf": 51728567, - "rcvaccession": "RCV000957991", - "reference_allele": "na", - "reference_allele_vcf": "T", - "review_status": "criteria provided, single submitter", - "rs_db_snp": 61736469, - "start": 51728567, - "stop": 51728567, - "submitter_categories": 2, - "tested_in_gtr": "N", - "type": "single nucleotide variant", - "variation_id": 777507 + "allele_id": 705202, + "alternate_allele": "na", + "alternate_allele_vcf": "C", + "assembly": "GRCh37", + "chromosome": "19", + "chromosome_accession": "NC_000019.9", + "clin_sig_simple": 0, + "clinical_significance": "Benign", + "cytogenetic": "19q13.41", + "gene_id": 945, + "gene_symbol": "CD33", + "guidelines": None, + "hgnc_id": "HGNC:1659", + "id": 1132137, + "last_evaluated": "Jul 16, 2018", + "name": "NM_001772.4(CD33):c.131T>C (p.Phe44Ser)", + "nsv_esv_db_var": None, + "number_submitters": 1, + "origin": "germline", + "origin_simple": "germline", + "otherIdentifiers": [], + "phenotypeMedgens": ["CN517202"], + "position_vcf": 51728567, + "rcvaccession": "RCV000957991", + "reference_allele": "na", + "reference_allele_vcf": "T", + "review_status": "criteria provided, single submitter", + "rs_db_snp": 61736469, + "start": 51728567, + "stop": 51728567, + "submitter_categories": 2, + "tested_in_gtr": "N", + "type": "single nucleotide variant", + "variation_id": 777507, } assert output[NUM_RESULTS] == 1 @@ -62,8 +59,8 @@ def test_get_clinvar(self, client): def test_get_clinvar_simple(self, client): response = client.get( - 'api/v1/clinvar/simple?phenotype=Alzheimer%20disease%202&hgnc_id=HGNC%3A4886&assembly=GRCh38&page_size=10&page=1', - content_type='application/json' + "api/v1/clinvar/simple?phenotype=Alzheimer%20disease%202&hgnc_id=HGNC%3A4886&assembly=GRCh38&page_size=10&page=1", + content_type="application/json", ) 
output = format_response_data(response) expected_results = { @@ -72,7 +69,7 @@ def test_get_clinvar_simple(self, client): "gene_symbol": "HFE", "hgnc_id": "HGNC:4886", "id": 14, - "rs_db_snp": 1800562 + "rs_db_snp": 1800562, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -84,8 +81,8 @@ def test_get_clinvar_simple(self, client): def test_get_phenotype_starts_with(self, client): response = client.get( - 'api/v1/clinvar/phenotype/starts_with?phenotype=MAN1B1-Related%20Disor&page_size=10&page=1', - content_type='application/json' + "api/v1/clinvar/phenotype/starts_with?phenotype=MAN1B1-Related%20Disor&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) assert output[NUM_RESULTS] == 1 @@ -97,19 +94,19 @@ def test_get_phenotype_starts_with(self, client): def test_get_by_other_identifier(self, client): response = client.get( - 'api/v1/clinvar/phenotype/by_other_identifier?db=OMIM&identifier=613653.%25&page_size=10&page=1', - content_type='application/json' + "api/v1/clinvar/phenotype/by_other_identifier?db=OMIM&identifier=613653.%25&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "allele_id": 15041, - "assembly": "GRCh37", - "db": "OMIM", - "gene_symbol": "AP5Z1", - "hgnc_id": "HGNC:22197", - "id": 1, - "identifier": "613653.0001", - "rs_db_snp": 397704705 + "allele_id": 15041, + "assembly": "GRCh37", + "db": "OMIM", + "gene_symbol": "AP5Z1", + "hgnc_id": "HGNC:22197", + "id": 1, + "identifier": "613653.0001", + "rs_db_snp": 397704705, } assert output[NUM_RESULTS] > 10 assert output[PAGE_SIZE] == 10 @@ -121,18 +118,18 @@ def test_get_by_other_identifier(self, client): def test_get_by_medgen(self, client): response = client.get( - 'api/v1/clinvar/phenotype/by_medgen?identifier=C0002395&page_size=10&page=1', - content_type='application/json' + "api/v1/clinvar/phenotype/by_medgen?identifier=C0002395&page_size=10&page=1", + 
content_type="application/json", ) output = format_response_data(response) expected_results = { - "allele_id": 15048, - "assembly": "GRCh37", - "gene_symbol": "HFE", - "hgnc_id": "HGNC:4886", - "id": 13, - "identifier": "C0002395", - "rs_db_snp": 1800562 + "allele_id": 15048, + "assembly": "GRCh37", + "gene_symbol": "HFE", + "hgnc_id": "HGNC:4886", + "id": 13, + "identifier": "C0002395", + "rs_db_snp": 1800562, } assert output[NUM_RESULTS] > 300 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_disgenet.py b/tests/test_api/test_ebel/test_disgenet.py index edcf2f8..c7c5284 100644 --- a/tests/test_api/test_ebel/test_disgenet.py +++ b/tests/test_api/test_ebel/test_disgenet.py @@ -1,27 +1,40 @@ """DisGeNet API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS +from .constants import NUM_RESULTS, RESULTS class TestDisgenet: def test_get_sources(self, client): - response = client.get( - 'api/v1/disgenet/sources', - content_type='application/json' - ) + response = client.get("api/v1/disgenet/sources", content_type="application/json") output = format_response_data(response) - expected_sources = ("BEFREE", "CGI", "CLINGEN", "CLINVAR", "CTD_human", "CTD_mouse", "CTD_rat", - "GENOMICS_ENGLAND", "GWASCAT", "GWASDB", "HPO", "LHGDN", "MGD", "ORPHANET", "PSYGENET", - "RGD", "UNIPROT") + expected_sources = ( + "BEFREE", + "CGI", + "CLINGEN", + "CLINVAR", + "CTD_human", + "CTD_mouse", + "CTD_rat", + "GENOMICS_ENGLAND", + "GWASCAT", + "GWASDB", + "HPO", + "LHGDN", + "MGD", + "ORPHANET", + "PSYGENET", + "RGD", + "UNIPROT", + ) assert isinstance(output, dict) assert all([col in output for col in expected_sources]) assert all([isinstance(val, int) for val in output.values()]) def test_get_disease_name_starts_with(self, client): response = client.get( - 'api/v1/disgenet/disease_name/starts_with?disease_name=alz&page_size=10&page=1', - content_type='application/json' + 
"api/v1/disgenet/disease_name/starts_with?disease_name=alz&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { @@ -34,7 +47,7 @@ def test_get_disease_name_starts_with(self, client): "ALZHEIMER DISEASE 6, LATE-ONSET": "C1854187", "Alzheimer Disease 12": "C1970209", "Alzheimer Disease 14": "C1970144", - "Alzheimer Disease 7": "C1853555" + "Alzheimer Disease 7": "C1853555", } assert output[NUM_RESULTS] >= 26 results = output[RESULTS] @@ -43,8 +56,8 @@ def test_get_disease_name_starts_with(self, client): def test_get_gene_symbol_starts_with(self, client): response = client.get( - 'api/v1/disgenet/gene_symbol/starts_with?gene_symbol=CD18&page_size=10&page=1', - content_type='application/json' + "api/v1/disgenet/gene_symbol/starts_with?gene_symbol=CD18&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = {"CD180": 4064} @@ -55,18 +68,18 @@ def test_get_gene_symbol_starts_with(self, client): def test_get_gene_disease_pmid_associations(self, client): response = client.get( - 'api/v1/disgenet/gene_disease_pmid_associations?gene_id=945&gene_symbol=CD33&disease_id=C0023418&disease_name=leukemia&pmid=15388576&source=LHGDN&page_size=10&page=1', - content_type='application/json' + "api/v1/disgenet/gene_disease_pmid_associations?gene_id=945&gene_symbol=CD33&disease_id=C0023418&disease_name=leukemia&pmid=15388576&source=LHGDN&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "disease_id": "C0023418", - "disease_name": "leukemia", - "gene_id": 945, - "gene_symbol": "CD33", - "pmid": 15388576, - "score": 0.1, - "source": "LHGDN" + "disease_id": "C0023418", + "disease_name": "leukemia", + "gene_id": 945, + "gene_symbol": "CD33", + "pmid": 15388576, + "score": 0.1, + "source": "LHGDN", } assert output[NUM_RESULTS] == 1 results = output[RESULTS] @@ -77,19 +90,19 @@ def 
test_get_gene_disease_pmid_associations(self, client): def test_get_variant_disease_pmid_associations(self, client): response = client.get( - 'api/v1/disgenet/variant_disease_pmid_associations?snp_id=rs1001179&chromosome=11&position=34438684&disease_id=C0002395&disease_name=Alzheimer%27s%20Disease&pmid=18248894&source=BEFREE&page_size=10&page=1', - content_type='application/json' + "api/v1/disgenet/variant_disease_pmid_associations?snp_id=rs1001179&chromosome=11&position=34438684&disease_id=C0002395&disease_name=Alzheimer%27s%20Disease&pmid=18248894&source=BEFREE&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "chromosome": "11", - "disease_id": "C0002395", - "disease_name": "Alzheimer's Disease", - "pmid": 18248894, - "position": 34438684, - "score": 0.01, - "snp_id": "rs1001179", - "source": "BEFREE" + "chromosome": "11", + "disease_id": "C0002395", + "disease_name": "Alzheimer's Disease", + "pmid": 18248894, + "position": 34438684, + "score": 0.01, + "snp_id": "rs1001179", + "source": "BEFREE", } assert output[NUM_RESULTS] == 1 results = output[RESULTS] @@ -100,8 +113,8 @@ def test_get_variant_disease_pmid_associations(self, client): def test_get_ebel_has_snp_disgenet(self, client): response = client.get( - 'api/v1/disgenet/ebel?disease_name=Alzheimer%27s%20Disease&rs_number=rs10408847&pmid=29777097&source=GWASCAT&gene_symbol=MARK4&page_size=10&page=1', - content_type='application/json' + "api/v1/disgenet/ebel?disease_name=Alzheimer%27s%20Disease&rs_number=rs10408847&pmid=29777097&source=GWASCAT&gene_symbol=MARK4&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expect_cols = ("disease_name", "gene_symbol", "pmids", "relation", "rs_number", "score", "source") diff --git a/tests/test_api/test_ebel/test_drugbank.py b/tests/test_api/test_ebel/test_drugbank.py index 61b49dc..1137116 100644 --- a/tests/test_api/test_ebel/test_drugbank.py +++ 
b/tests/test_api/test_ebel/test_drugbank.py @@ -1,15 +1,12 @@ """DrugBank API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestDrugbank: def test_get_by_id(self, client): - response = client.get( - 'api/v1/drugbank/by_id?drugbank_id=DB00843', - content_type='application/json' - ) + response = client.get("api/v1/drugbank/by_id?drugbank_id=DB00843", content_type="application/json") output = format_response_data(response) expected_results = { # Only check a subset "cas_number": "120014-06-4", @@ -22,8 +19,8 @@ def test_get_by_id(self, client): def test_get_drugbank(self, client): response = client.get( - 'api/v1/drugbank?drugbank_id=DB00843&name=Donepezil&description=%25Aricept%25&cas_number=120014-06-4&unii=8SSC91326P&state=solid&indication=%25mild%20to%20moderate%20Alzheimer%E2%80%99s%20Disease%25&pharmacodynamics=%25inhibiting%25&toxicity=%25rat%20oral%20LD50%25&metabolism=%25CYP3A4%25&absorption=%25gastrointestinal%20tract%25&half_life=%25hours%25&route_of_elimination=%25urine%25&volume_of_distribution=%25mg%20dose%25&clearance=%25plasma%25&mechanism_of_action=%25cognitive%20and%20behavioral%20decline%25&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank?drugbank_id=DB00843&name=Donepezil&description=%25Aricept%25&cas_number=120014-06-4&unii=8SSC91326P&state=solid&indication=%25mild%20to%20moderate%20Alzheimer%E2%80%99s%20Disease%25&pharmacodynamics=%25inhibiting%25&toxicity=%25rat%20oral%20LD50%25&metabolism=%25CYP3A4%25&absorption=%25gastrointestinal%20tract%25&half_life=%25hours%25&route_of_elimination=%25urine%25&volume_of_distribution=%25mg%20dose%25&clearance=%25plasma%25&mechanism_of_action=%25cognitive%20and%20behavioral%20decline%25&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { # Only check a subset @@ -41,8 +38,8 @@ def 
test_get_drugbank(self, client): def test_get_interaction(self, client): response = client.get( - 'api/v1/drugbank/interaction?drugbank_id=DB06605&name=Apixaban&description=%25hemorrhage%25&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/interaction?drugbank_id=DB06605&name=Apixaban&description=%25hemorrhage%25&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_cols = ("description", "drugbank_id", "interactor_drugbank_id", "name") @@ -57,8 +54,7 @@ def test_get_interaction(self, client): def test_get_pathway(self, client): response = client.get( - 'api/v1/drugbank/pathway?drugbank_id=DB00114&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/pathway?drugbank_id=DB00114&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) expected_cols = ("drugbank_id", "smpdb_id") @@ -73,14 +69,10 @@ def test_get_pathway(self, client): def test_get_status(self, client): response = client.get( - 'api/v1/drugbank/status?drugbank_id=DB06605&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/status?drugbank_id=DB06605&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) - expected_results = { - "drugbank_id": "DB06605", - "smpdb_id": "approved" - } + expected_results = {"drugbank_id": "DB06605", "smpdb_id": "approved"} assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -91,8 +83,7 @@ def test_get_status(self, client): def test_get_patent(self, client): response = client.get( - 'api/v1/drugbank/patent?drugbank_id=DB00843&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/patent?drugbank_id=DB00843&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) expected_cols = ("approved", "country", "drugbank_id", "expires", "number", "pediatric_extension") @@ -107,15 +98,11 
@@ def test_get_patent(self, client): def test_get_external_identifier(self, client): response = client.get( - 'api/v1/drugbank/external_identifier?drugbank_id=DB00114&resource=BindingDB&identifier=50118216&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/external_identifier?drugbank_id=DB00114&resource=BindingDB&identifier=50118216&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_results = { - "drugbank_id": "DB00114", - "identifier": "50118216", - "resource": "BindingDB" - } + expected_results = {"drugbank_id": "DB00114", "identifier": "50118216", "resource": "BindingDB"} assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -126,8 +113,7 @@ def test_get_external_identifier(self, client): def test_get_reference(self, client): response = client.get( - 'api/v1/drugbank/reference?drugbank_id=DB06605&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/reference?drugbank_id=DB06605&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) expected_cols = ("drugbank_id", "pmid") @@ -143,15 +129,14 @@ def test_get_reference(self, client): def test_get_target(self, client): response = client.get( - 'api/v1/drugbank/target?drugbank_id=DB06605&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/target?drugbank_id=DB06605&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) expected_results = { - "action": "inhibitor", - "drugbank_id": "DB06605", - "known_action": "yes", - "uniprot": "P00742" + "action": "inhibitor", + "drugbank_id": "DB06605", + "known_action": "yes", + "uniprot": "P00742", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -163,8 +148,7 @@ def test_get_target(self, client): def test_get_product_name(self, client): response = client.get( - 
'api/v1/drugbank/product_name?drugbank_id=DB06605&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/product_name?drugbank_id=DB06605&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) expected_cols = ("drugbank_id", "name") @@ -179,8 +163,7 @@ def test_get_product_name(self, client): def test_get_synonym(self, client): response = client.get( - 'api/v1/drugbank/synonym?drugbank_id=DB06605&page_size=10&page=1', - content_type='application/json' + "api/v1/drugbank/synonym?drugbank_id=DB06605&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) expected_cols = ("drugbank_id", "synonym") diff --git a/tests/test_api/test_ebel/test_ebel.py b/tests/test_api/test_ebel/test_ebel.py index eee8a68..5a7b91f 100644 --- a/tests/test_api/test_ebel/test_ebel.py +++ b/tests/test_api/test_ebel/test_ebel.py @@ -1,7 +1,7 @@ """eBEL API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestEbel: @@ -15,7 +15,7 @@ def test_get_intact_by_uniprot(self, client): def test_find_all(self, client): response = client.post( - 'api/v1/ebel/find_all', + "api/v1/ebel/find_all", json={"term": "CD33"}, ) output = format_response_data(response) diff --git a/tests/test_api/test_ebel/test_ensembl.py b/tests/test_api/test_ebel/test_ensembl.py index 43d3aea..7823a5b 100644 --- a/tests/test_api/test_ebel/test_ensembl.py +++ b/tests/test_api/test_ebel/test_ensembl.py @@ -1,28 +1,27 @@ """EnsEMBL API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestEnsembl: def test_get_ensembl(self, client): response = client.get( - 'api/v1/ebel/ensembl/ensembl?enst=ENST00000436584&chromosome=19', - content_type='application/json' + 
"api/v1/ebel/ensembl/ensembl?enst=ENST00000436584&chromosome=19", content_type="application/json" ) output = format_response_data(response) expected_results = { - "chromosome": "19", - "enst": "ENST00000436584", - "gene_id": "ENSG00000105383.15", - "gene_id_short": "ENSG00000105383", - "hgnc_id": "HGNC:1659", - "id": 29665, - "orientation": 1, - "start": 51225064, - "stop": 51236572, - "symbol": "CD33", - "version": 38 + "chromosome": "19", + "enst": "ENST00000436584", + "gene_id": "ENSG00000105383.15", + "gene_id_short": "ENSG00000105383", + "hgnc_id": "HGNC:1659", + "id": 29665, + "orientation": 1, + "start": 51225064, + "stop": 51236572, + "symbol": "CD33", + "version": 38, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_gwas_catalog.py b/tests/test_api/test_ebel/test_gwas_catalog.py index f1d7db6..0d403a6 100644 --- a/tests/test_api/test_ebel/test_gwas_catalog.py +++ b/tests/test_api/test_ebel/test_gwas_catalog.py @@ -1,54 +1,52 @@ """GWAS Catalog API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestGwasCatalog: def test_get_gwas_catalog(self, client): response = client.get( - 
'api/v1/gwas_catalog/gwas_catalog?date_added_to_catalog=2013-10-24&pubmedid=23563609&first_author=Wheeler%20E&date=2013-04-07&journal=Nat%20Genet&link=www.ncbi.nlm.nih.gov%2Fpubmed%2F23563609&study=Genome-wide%20SNP%20and%20CNV%20analysis%20identifies%20common%20and%20low-frequency%20variants%20associated%20with%20severe%20early-onset%20obesity.&disease_trait=Obesity%20%28early%20onset%20extreme%29&initial_sample_size=1%2C509%20European%20ancestry%20cases%2C%205%2C380%20European%20ancestry%20controls&replication_sample_size=971%20European%20ancestry%20cases%2C%201%2C990%20European%20ancestry%20controls®ion=16q12.2&chr_id=16&chr_pos=53767042&reported_gene_s=FTO&mapped_gene=FTO&strongest_snp_risk_allele=rs1421085-C&snp=rs1421085&snp_id_current=1421085&context=intron_variant&risk_allele_frequency=0.41&p_value=3e-28&pvalue_mlog=27.5229&or_or_beta=1.44&_95_ci_text=%5B1.35-1.54%5D&platform_snps_passing_qc=Affymetrix%20%5B~%202000000%5D%20%28imputed%29&cnv=N', - content_type='application/json' + "api/v1/gwas_catalog/gwas_catalog?date_added_to_catalog=2013-10-24&pubmedid=23563609&first_author=Wheeler%20E&date=2013-04-07&journal=Nat%20Genet&link=www.ncbi.nlm.nih.gov%2Fpubmed%2F23563609&study=Genome-wide%20SNP%20and%20CNV%20analysis%20identifies%20common%20and%20low-frequency%20variants%20associated%20with%20severe%20early-onset%20obesity.&disease_trait=Obesity%20%28early%20onset%20extreme%29&initial_sample_size=1%2C509%20European%20ancestry%20cases%2C%205%2C380%20European%20ancestry%20controls&replication_sample_size=971%20European%20ancestry%20cases%2C%201%2C990%20European%20ancestry%20controls®ion=16q12.2&chr_id=16&chr_pos=53767042&reported_gene_s=FTO&mapped_gene=FTO&strongest_snp_risk_allele=rs1421085-C&snp=rs1421085&snp_id_current=1421085&context=intron_variant&risk_allele_frequency=0.41&p_value=3e-28&pvalue_mlog=27.5229&or_or_beta=1.44&_95_ci_text=%5B1.35-1.54%5D&platform_snps_passing_qc=Affymetrix%20%5B~%202000000%5D%20%28imputed%29&cnv=N", + 
content_type="application/json", ) output = format_response_data(response) expected_results = { - "_95_ci_text": "[1.35-1.54]", - "chr_id": "16", - "chr_pos": "53767042", - "cnv": "N", - "context": "intron_variant", - "date": "2013-04-07", - "date_added_to_catalog": "2013-10-24", - "disease_trait": "Obesity (early onset extreme)", - "downstream_gene_distance": None, - "downstream_gene_id": None, - "first_author": "Wheeler E", - "id": 146661, - "initial_sample_size": "1,509 European ancestry cases, 5,380 European ancestry controls", - "intergenic": 0, - "journal": "Nat Genet", - "link": "www.ncbi.nlm.nih.gov/pubmed/23563609", - "mapped_gene": "FTO", - "merged": 0, - "or_or_beta": 1.44, - "p_value": 3e-28, - "p_value_text": None, - "platform_snps_passing_qc": "Affymetrix [~ 2000000] (imputed)", - "pubmedid": 23563609, - "pvalue_mlog": 27.5229, - "region": "16q12.2", - "replication_sample_size": "971 European ancestry cases, 1,990 European ancestry controls", - "reported_gene_s": "FTO", - "risk_allele_frequency": "0.41", - "snp": "rs1421085", - "snp_genes": [ - "ENSG00000140718" - ], - "snp_id_current": "1421085", - "strongest_snp_risk_allele": "rs1421085-C", - "study": "Genome-wide SNP and CNV analysis identifies common and low-frequency variants associated with severe early-onset obesity.", - "upstream_gene_distance": None, - "upstream_gene_id": None + "_95_ci_text": "[1.35-1.54]", + "chr_id": "16", + "chr_pos": "53767042", + "cnv": "N", + "context": "intron_variant", + "date": "2013-04-07", + "date_added_to_catalog": "2013-10-24", + "disease_trait": "Obesity (early onset extreme)", + "downstream_gene_distance": None, + "downstream_gene_id": None, + "first_author": "Wheeler E", + "id": 146661, + "initial_sample_size": "1,509 European ancestry cases, 5,380 European ancestry controls", + "intergenic": 0, + "journal": "Nat Genet", + "link": "www.ncbi.nlm.nih.gov/pubmed/23563609", + "mapped_gene": "FTO", + "merged": 0, + "or_or_beta": 1.44, + "p_value": 3e-28, + 
"p_value_text": None, + "platform_snps_passing_qc": "Affymetrix [~ 2000000] (imputed)", + "pubmedid": 23563609, + "pvalue_mlog": 27.5229, + "region": "16q12.2", + "replication_sample_size": "971 European ancestry cases, 1,990 European ancestry controls", + "reported_gene_s": "FTO", + "risk_allele_frequency": "0.41", + "snp": "rs1421085", + "snp_genes": ["ENSG00000140718"], + "snp_id_current": "1421085", + "strongest_snp_risk_allele": "rs1421085-C", + "study": "Genome-wide SNP and CNV analysis identifies common and low-frequency variants associated with severe early-onset obesity.", + "upstream_gene_distance": None, + "upstream_gene_id": None, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_hgnc.py b/tests/test_api/test_ebel/test_hgnc.py index 807eb1d..79af636 100644 --- a/tests/test_api/test_ebel/test_hgnc.py +++ b/tests/test_api/test_ebel/test_hgnc.py @@ -1,60 +1,39 @@ """HGNC API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestHgnc: def test_get_by_symbol(self, client): - response = client.get( - 'api/v1/hgnc/by_symbol?symbol=CD33', - content_type='application/json' - ) + response = client.get("api/v1/hgnc/by_symbol?symbol=CD33", content_type="application/json") output = format_response_data(response) expected_results = { "agr": "HGNC:1659", - "alias_names": [ - "sialic acid binding Ig-like lectin 3" - ], - "alias_symbols": [ - "SIGLEC3", - "SIGLEC-3", - "p67", - "FLJ00391" - ], + "alias_names": ["sialic acid binding Ig-like lectin 3"], + "alias_symbols": ["SIGLEC3", "SIGLEC-3", "p67", "FLJ00391"], "bioparadigms_slc": None, - "ccdss": [ - "CCDS46157", - "CCDS54299", - "CCDS33084" - ], + "ccdss": ["CCDS46157", "CCDS54299", "CCDS33084"], "cd": "CD33", "cosmic": None, "date_approved_reserved": "1986-01-01", "date_modified": "2016-10-05", "date_name_changed": "2006-03-28", 
"date_symbol_changed": None, - "enas": [ - "M23197" - ], + "enas": ["M23197"], "ensembl_gene_id": "ENSG00000105383", "entrez_id": 945, "enzymes": [], - "gene_group_ids": [ - 471, - 590, - 745 - ], + "gene_group_ids": [471, 590, 745], "gene_group_names": [ "CD molecules", "V-set domain containing", - "Sialic acid binding Ig like lectins" + "Sialic acid binding Ig like lectins", ], "hgnc_id": "HGNC:1659", "homeodb": None, "horde_id": None, "imgt": None, - "intermediate_filament_db": None, "iuphar": "objectId:2601", "kznf_gene_catalog": None, "lncipedia": None, @@ -64,51 +43,34 @@ def test_get_by_symbol(self, client): "locus_group": "protein-coding gene", "locus_type": "gene with protein product", "lsdbs": [], - "mamit_trnadb": None, "merops": None, - "mgds": [ - "MGI:99440" - ], + "mgds": ["MGI:99440"], "mirbase": None, "name": "CD33 molecule", - "omims": [ - 159590 - ], + "omims": [159590], "orphanet": None, "pre_symbols": [], - "prev_names": [ - "CD33 antigen (gp67)" - ], - "pseudogene_org": None, - "pubmeds": [ - 3139766, - 9465907 - ], - "refseqs": [ - "NM_001772" - ], - "rgds": [ - "RGD:1596020" - ], + "prev_names": ["CD33 antigen (gp67)"], + "pubmeds": [3139766, 9465907], + "refseqs": ["NM_001772"], + "rgds": ["RGD:1596020"], "rna_centrals": [], "snornabase": None, "status": "Approved", "symbol": "CD33", "ucsc_id": "uc002pwa.3", - "uniprots": [ - "P20138" - ], + "uniprots": ["P20138"], "uuid": "982e665a-2cc7-4ef3-b9f6-4406e08ed6c8", "vega_id": "OTTHUMG00000182891", - "version": 1702607059790331904 + "version": 1702607059790331904, } assert isinstance(output, dict) assert output == expected_results def test_get_uniprot_accession_by_hgnc_symbol(self, client): response = client.get( - 'api/v1/hgnc/uniprot_accession_by_hgnc_symbol?symbol=CD33', - content_type='application/json' + "api/v1/hgnc/uniprot_accession_by_hgnc_symbol?symbol=CD33", + content_type="application/json", ) output = format_response_data(response) assert isinstance(output, str) @@ -116,100 +78,65 
@@ def test_get_uniprot_accession_by_hgnc_symbol(self, client): def test_get_hgnc(self, client): response = client.get( - 'api/v1/hgnc?symbol=CD33&page_size=10&page=1', - content_type='application/json' + "api/v1/hgnc?symbol=CD33&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "agr": "HGNC:1659", - "alias_names": [ - "sialic acid binding Ig-like lectin 3" - ], - "alias_symbols": [ - "SIGLEC3", - "SIGLEC-3", - "p67", - "FLJ00391" - ], - "bioparadigms_slc": None, - "ccdss": [ - "CCDS46157", - "CCDS54299", - "CCDS33084" - ], - "cd": "CD33", - "cosmic": None, - "date_approved_reserved": "1986-01-01", - "date_modified": "2016-10-05", - "date_name_changed": "2006-03-28", - "date_symbol_changed": None, - "enas": [ - "M23197" - ], - "ensembl_gene_id": "ENSG00000105383", - "entrez_id": 945, - "enzymes": [], - "gene_group_ids": [ - 471, - 590, - 745 - ], - "gene_group_names": [ - "CD molecules", - "V-set domain containing", - "Sialic acid binding Ig like lectins" - ], - "hgnc_id": "HGNC:1659", - "homeodb": None, - "horde_id": None, - "imgt": None, - "intermediate_filament_db": None, - "iuphar": "objectId:2601", - "kznf_gene_catalog": None, - "lncipedia": None, - "lncrnadb": None, - "location": "19q13.41", - "location_sortable": "19q13.41", - "locus_group": "protein-coding gene", - "locus_type": "gene with protein product", - "lsdbs": [], - "mamit_trnadb": None, - "merops": None, - "mgds": [ - "MGI:99440" - ], - "mirbase": None, - "name": "CD33 molecule", - "omims": [ - 159590 - ], - "orphanet": None, - "pre_symbols": [], - "prev_names": [ - "CD33 antigen (gp67)" - ], - "pseudogene_org": None, - "pubmeds": [ - 3139766, - 9465907 - ], - "refseqs": [ - "NM_001772" - ], - "rgds": [ - "RGD:1596020" - ], - "rna_centrals": [], - "snornabase": None, - "status": "Approved", - "symbol": "CD33", - "ucsc_id": "uc002pwa.3", - "uniprots": [ - "P20138" - ], - "uuid": "982e665a-2cc7-4ef3-b9f6-4406e08ed6c8", - 
"vega_id": "OTTHUMG00000182891", - "version": 1702607059790331904 + "agr": "HGNC:1659", + "alias_names": ["sialic acid binding Ig-like lectin 3"], + "alias_symbols": ["SIGLEC3", "SIGLEC-3", "p67", "FLJ00391"], + "bioparadigms_slc": None, + "ccdss": ["CCDS46157", "CCDS54299", "CCDS33084"], + "cd": "CD33", + "cosmic": None, + "date_approved_reserved": "1986-01-01", + "date_modified": "2016-10-05", + "date_name_changed": "2006-03-28", + "date_symbol_changed": None, + "enas": ["M23197"], + "ensembl_gene_id": "ENSG00000105383", + "entrez_id": 945, + "enzymes": [], + "gene_group_ids": [471, 590, 745], + "gene_group_names": [ + "CD molecules", + "V-set domain containing", + "Sialic acid binding Ig like lectins", + ], + "hgnc_id": "HGNC:1659", + "homeodb": None, + "horde_id": None, + "imgt": None, + "iuphar": "objectId:2601", + "kznf_gene_catalog": None, + "lncipedia": None, + "lncrnadb": None, + "location": "19q13.41", + "location_sortable": "19q13.41", + "locus_group": "protein-coding gene", + "locus_type": "gene with protein product", + "lsdbs": [], + "merops": None, + "mgds": ["MGI:99440"], + "mirbase": None, + "name": "CD33 molecule", + "omims": [159590], + "orphanet": None, + "pre_symbols": [], + "prev_names": ["CD33 antigen (gp67)"], + "pubmeds": [3139766, 9465907], + "refseqs": ["NM_001772"], + "rgds": ["RGD:1596020"], + "rna_centrals": [], + "snornabase": None, + "status": "Approved", + "symbol": "CD33", + "ucsc_id": "uc002pwa.3", + "uniprots": ["P20138"], + "uuid": "982e665a-2cc7-4ef3-b9f6-4406e08ed6c8", + "vega_id": "OTTHUMG00000182891", + "version": 1702607059790331904, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_intact.py b/tests/test_api/test_ebel/test_intact.py index b129ce6..ec5a8ee 100644 --- a/tests/test_api/test_ebel/test_intact.py +++ b/tests/test_api/test_ebel/test_intact.py @@ -1,27 +1,27 @@ """IntAct API unit tests.""" from .conftest import format_response_data -from .constants import 
RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestIntAct: def test_get_by_uniprot(self, client): response = client.get( - 'api/v1/intact?confidence_value=0.56&detection_method=two%20hybrid%20array&detection_method_psimi_id=397&int_a_uniprot_id=P20138&int_b_uniprot_id=Q9Y5Z9&interaction_ids=%25EBI-23994924%25&interaction_type=physical%20association&interaction_type_psimi_id=915&page_size=10&page=1', - content_type='application/json' + "api/v1/intact?confidence_value=0.56&detection_method=two%20hybrid%20array&detection_method_psimi_id=397&int_a_uniprot_id=P20138&int_b_uniprot_id=Q9Y5Z9&interaction_ids=%25EBI-23994924%25&interaction_type=physical%20association&interaction_type_psimi_id=915&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "confidence_value": 0.56, - "detection_method": "two hybrid array", - "detection_method_psimi_id": 397, - "id": 681219, - "int_a_uniprot_id": "P20138", - "int_b_uniprot_id": "Q9Y5Z9", - "interaction_ids": "intact:EBI-23994924|imex:IM-25472-94969", - "interaction_type": "physical association", - "interaction_type_psimi_id": 915, - "pmid": 32296183 + "confidence_value": 0.56, + "detection_method": "two hybrid array", + "detection_method_psimi_id": 397, + "id": 681219, + "int_a_uniprot_id": "P20138", + "int_b_uniprot_id": "Q9Y5Z9", + "interaction_ids": "intact:EBI-23994924|imex:IM-25472-94969", + "interaction_type": "physical association", + "interaction_type_psimi_id": 915, + "pmid": 32296183, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_iuphar.py b/tests/test_api/test_ebel/test_iuphar.py index af74e53..5f76ed4 100644 --- a/tests/test_api/test_ebel/test_iuphar.py +++ b/tests/test_api/test_ebel/test_iuphar.py @@ -1,8 +1,7 @@ """IUPHAR API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE - 
+from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS example_result = { "action": "Inhibition", @@ -43,57 +42,57 @@ "target_ligand_uniprot": None, "target_species": "Human", "target_uniprot": "Q9UBN7", - "type": "Inhibitor" + "type": "Inhibitor", } class TestIuphar: def test_get_interaction(self, client): response = client.get( - 'api/v1/iuphar/interaction?target=CD33&target_id=2601&target_gene_symbol=CD33&target_uniprot=P20138&target_ensembl_gene_id=ENSG00000105383&target_species=Human&ligand=gemtuzumab%20ozogamicin&ligand_id=6775&ligand_pubchem_sid=178103381&approved_drug=1&type=Antibody&action=Binding&selectivity=Selective&primary_target=true&original_affinity_relation=%3D&pubmed_id=10720144&page_size=10&page=1', - content_type='application/json' + "api/v1/iuphar/interaction?target=CD33&target_id=2601&target_gene_symbol=CD33&target_uniprot=P20138&target_ensembl_gene_id=ENSG00000105383&target_species=Human&ligand=gemtuzumab%20ozogamicin&ligand_id=6775&ligand_pubchem_sid=178103381&approved_drug=1&type=Antibody&action=Binding&selectivity=Selective&primary_target=true&original_affinity_relation=%3D&pubmed_id=10720144&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "action": "Binding", - "action_comment": None, - "affinity_high": None, - "affinity_low": None, - "affinity_median": None, - "affinity_units": "-", - "approved_drug": "1", - "assay_description": None, - "concentration_range": None, - "endogenous": False, - "id": 4512, - "ligand": "gemtuzumab ozogamicin", - "ligand_context": None, - "ligand_gene_symbol": None, - "ligand_id": 6775, - "ligand_pubchem_sid": 178103381, - "ligand_species": None, - "original_affinity_high_nm": None, - "original_affinity_low_nm": None, - "original_affinity_median_nm": None, - "original_affinity_relation": "=", - "original_affinity_units": "-", - "primary_target": True, - "pubmed_id": "10720144", - "receptor_site": None, - "selectivity": "Selective", - 
"target": "CD33", - "target_ensembl_gene_id": "ENSG00000105383", - "target_gene_symbol": "CD33", - "target_id": 2601, - "target_ligand": None, - "target_ligand_ensembl_gene_id": None, - "target_ligand_gene_symbol": None, - "target_ligand_id": None, - "target_ligand_pubchem_sid": None, - "target_ligand_uniprot": None, - "target_species": "Human", - "target_uniprot": "P20138", - "type": "Antibody" + "action": "Binding", + "action_comment": None, + "affinity_high": None, + "affinity_low": None, + "affinity_median": None, + "affinity_units": "-", + "approved_drug": "1", + "assay_description": None, + "concentration_range": None, + "endogenous": False, + "id": 4512, + "ligand": "gemtuzumab ozogamicin", + "ligand_context": None, + "ligand_gene_symbol": None, + "ligand_id": 6775, + "ligand_pubchem_sid": 178103381, + "ligand_species": None, + "original_affinity_high_nm": None, + "original_affinity_low_nm": None, + "original_affinity_median_nm": None, + "original_affinity_relation": "=", + "original_affinity_units": "-", + "primary_target": True, + "pubmed_id": "10720144", + "receptor_site": None, + "selectivity": "Selective", + "target": "CD33", + "target_ensembl_gene_id": "ENSG00000105383", + "target_gene_symbol": "CD33", + "target_id": 2601, + "target_ligand": None, + "target_ligand_ensembl_gene_id": None, + "target_ligand_gene_symbol": None, + "target_ligand_id": None, + "target_ligand_pubchem_sid": None, + "target_ligand_uniprot": None, + "target_species": "Human", + "target_uniprot": "P20138", + "type": "Antibody", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -104,40 +103,37 @@ def test_get_interaction(self, client): assert hit == expected_results def test_get_ligandby_by_id(self, client): - response = client.get( - 'api/v1/iuphar/ligand/by_id?id=1', - content_type='application/json' - ) + response = client.get("api/v1/iuphar/ligand/by_id?id=1", content_type="application/json") output = format_response_data(response) expected_result = { - 
"approved": None, - "gto_immu_pdb": None, - "gto_mpdb": None, - "id": 1, - "inchi": "InChI=1S/C22H26FN3O4/c23-17-6-4-16(5-7-17)22(28)24-8-9-25-10-12-26(13-11-25)19-2-1-3-20-21(19)29-15-18(14-27)30-20/h1-7,18,27H,8-15H2,(H,24,28)/t18-/m0/s1", - "inchi_key": "NYSDRDDQELAVKP-SFHVURJKSA-N", - "inn": "flesinoxan", - "iupac_name": "4-fluoro-N-[2-[4-[(3S)-3-(hydroxymethyl)-2,3-dihydro-1,4-benzodioxin-8-yl]piperazin-1-yl]ethyl]benzamide", - "labelled": None, - "name": "flesinoxan", - "pubchem_cid": "57347", - "pubchem_sid": 135650267, - "radioactive": None, - "smiles": "OC[C@H]1COc2c(O1)cccc2N1CCN(CC1)CCNC(=O)c1ccc(cc1)F", - "species": None, - "synonyms": "(+)-flesinoxan|DU-29,373", - "type": "Synthetic organic", - "uniprot_id": None, - "withdrawn": None + "approved": None, + "gto_immu_pdb": None, + "gto_mpdb": None, + "id": 1, + "inchi": "InChI=1S/C22H26FN3O4/c23-17-6-4-16(5-7-17)22(28)24-8-9-25-10-12-26(13-11-25)19-2-1-3-20-21(19)29-15-18(14-27)30-20/h1-7,18,27H,8-15H2,(H,24,28)/t18-/m0/s1", + "inchi_key": "NYSDRDDQELAVKP-SFHVURJKSA-N", + "inn": "flesinoxan", + "iupac_name": "4-fluoro-N-[2-[4-[(3S)-3-(hydroxymethyl)-2,3-dihydro-1,4-benzodioxin-8-yl]piperazin-1-yl]ethyl]benzamide", + "labelled": None, + "name": "flesinoxan", + "pubchem_cid": "57347", + "pubchem_sid": 135650267, + "radioactive": None, + "smiles": "OC[C@H]1COc2c(O1)cccc2N1CCN(CC1)CCNC(=O)c1ccc(cc1)F", + "species": None, + "synonyms": "(+)-flesinoxan|DU-29,373", + "type": "Synthetic organic", + "uniprot_id": None, + "withdrawn": None, } assert isinstance(output, dict) assert output == expected_result def test_get_interaction_by_target_uniprot(self, client): response = client.get( - 'api/v1/iuphar/interaction/by_target_uniprot?target_uniprot=P20138&page_size=10&page=1', - content_type='application/json' + "api/v1/iuphar/interaction/by_target_uniprot?target_uniprot=P20138&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) @@ -152,8 +148,8 @@ def 
test_get_interaction_by_target_uniprot(self, client): def test_get_interaction_by_target_gene_symbol(self, client): response = client.get( - 'api/v1/iuphar/interaction/by_target_gene_symbol?target_gene_symbol=HDAC6&page_size=10&page=1', - content_type='application/json' + "api/v1/iuphar/interaction/by_target_gene_symbol?target_gene_symbol=HDAC6&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) diff --git a/tests/test_api/test_ebel/test_kegg.py b/tests/test_api/test_ebel/test_kegg.py index b1ec495..8e71b59 100644 --- a/tests/test_api/test_ebel/test_kegg.py +++ b/tests/test_api/test_ebel/test_kegg.py @@ -1,27 +1,27 @@ """KEGG API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestKegg: def test_get_kegg(self, client): response = client.get( - 'api/v1/kegg?pathway_identifier=hsa05010&pathway_name=Alzheimer%20disease&kegg_species_id=hsa&kegg_gene_id_a=hsa%3A4137&gene_symbol_a=MAPT&kegg_gene_id_b=hsa%3A100532726&gene_symbol_b=NDUFC2-KCTD14&kegg_int_type=PPrel&interaction_type=inhibition&page_size=10&page=1', - content_type='application/json' + "api/v1/kegg?pathway_identifier=hsa05010&pathway_name=Alzheimer%20disease&kegg_species_id=hsa&kegg_gene_id_a=hsa%3A4137&gene_symbol_a=MAPT&kegg_gene_id_b=hsa%3A100532726&gene_symbol_b=NDUFC2-KCTD14&kegg_int_type=PPrel&interaction_type=inhibition&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "gene_symbol_a": "MAPT", - "gene_symbol_b": "NDUFC2-KCTD14", - "id": 95841, - "interaction_type": "inhibition", - "kegg_gene_id_a": "hsa:4137", - "kegg_gene_id_b": "hsa:100532726", - "kegg_int_type": "PPrel", - "kegg_species_id": "hsa", - "pathway_identifier": "hsa05010", - "pathway_name": "Alzheimer disease" + "gene_symbol_a": "MAPT", + "gene_symbol_b": "NDUFC2-KCTD14", + "id": 95841, + 
"interaction_type": "inhibition", + "kegg_gene_id_a": "hsa:4137", + "kegg_gene_id_b": "hsa:100532726", + "kegg_int_type": "PPrel", + "kegg_species_id": "hsa", + "pathway_identifier": "hsa05010", + "pathway_name": "Alzheimer disease", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -33,8 +33,7 @@ def test_get_kegg(self, client): def test_get_by_gene_symbol(self, client): response = client.get( - 'api/v1/kegg/by_gene_symbol?gene_symbol=CD19&page_size=10&page=1', - content_type='application/json' + "api/v1/kegg/by_gene_symbol?gene_symbol=CD19&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) example_results = { # Copy/paste example results to get col names quickly @@ -47,7 +46,7 @@ def test_get_by_gene_symbol(self, client): "kegg_int_type": "PPrel", "kegg_species_id": "hsa", "pathway_identifier": "hsa05010", - "pathway_name": "Alzheimer disease" + "pathway_name": "Alzheimer disease", } assert output[NUM_RESULTS] >= 36 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_mirtarbase.py b/tests/test_api/test_ebel/test_mirtarbase.py index a40f3cd..907c5ef 100644 --- a/tests/test_api/test_ebel/test_mirtarbase.py +++ b/tests/test_api/test_ebel/test_mirtarbase.py @@ -1,27 +1,27 @@ """miRTarBase API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestMirTarBase: def test_get_mirtarbase(self, client): response = client.get( - 'api/v1/mirtarbase?mi_rtar_base_id=MIRT019168&mi_rna=hsa-miR-335-5p&species_mi_rna=Homo%20sapiens&target_gene=CD33&target_gene_entrez_id=945&species_target_gene=Homo%20sapiens&experiments=Microarray&support_type=Functional%20MTI%20%28Weak%29&references_pmid=18185580&page_size=10&page=1', - content_type='application/json' + 
"api/v1/mirtarbase?mi_rtar_base_id=MIRT019168&mi_rna=hsa-miR-335-5p&species_mi_rna=Homo%20sapiens&target_gene=CD33&target_gene_entrez_id=945&species_target_gene=Homo%20sapiens&experiments=Microarray&support_type=Functional%20MTI%20%28Weak%29&references_pmid=18185580&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "experiments": "Microarray", - "id": 15465, - "mi_rna": "hsa-miR-335-5p", - "mi_rtar_base_id": "MIRT019168", - "references_pmid": 18185580, - "species_mi_rna": "Homo sapiens", - "species_target_gene": "Homo sapiens", - "support_type": "Functional MTI (Weak)", - "target_gene": "CD33", - "target_gene_entrez_id": 945 + "experiments": "Microarray", + "id": 15465, + "mi_rna": "hsa-miR-335-5p", + "mi_rtar_base_id": "MIRT019168", + "references_pmid": 18185580, + "species_mi_rna": "Homo sapiens", + "species_target_gene": "Homo sapiens", + "support_type": "Functional MTI (Weak)", + "target_gene": "CD33", + "target_gene_entrez_id": 945, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_ncbi.py b/tests/test_api/test_ebel/test_ncbi.py index a227cc4..e44ce26 100644 --- a/tests/test_api/test_ebel/test_ncbi.py +++ b/tests/test_api/test_ebel/test_ncbi.py @@ -1,8 +1,7 @@ """NCBI API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE - +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS # class TestNcbi: # def test_get_gene_by_go(self, client): diff --git a/tests/test_api/test_ebel/test_nsides.py b/tests/test_api/test_ebel/test_nsides.py index e47df52..f1b2fd5 100644 --- a/tests/test_api/test_ebel/test_nsides.py +++ b/tests/test_api/test_ebel/test_nsides.py @@ -1,14 +1,14 @@ """NSIDES API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestNsides: def 
test_get_nsides(self, client): response = client.get( - 'api/v1/nsides?drug_rxnorn_id=4024&drug_concept_name=ergoloid%20mesylates%2C%20USP&condition_meddra_id=10002034&condition_concept_name=Anaemia&a=6&b=126&c=21&d=1299&prr=2.85714&prr_error=0.45382&mean_reporting_frequency=0.0454545&page_size=10&page=1', - content_type='application/json' + "api/v1/nsides?drug_rxnorn_id=4024&drug_concept_name=ergoloid%20mesylates%2C%20USP&condition_meddra_id=10002034&condition_concept_name=Anaemia&a=6&b=126&c=21&d=1299&prr=2.85714&prr_error=0.45382&mean_reporting_frequency=0.0454545&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { @@ -24,7 +24,7 @@ def test_get_nsides(self, client): "mean_reporting_frequency": 0.0454545, "prr": 2.85714, "prr_error": 0.45382, - "source": "offsides" + "source": "offsides", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_pathway_commons.py b/tests/test_api/test_ebel/test_pathway_commons.py index 701689b..a1873da 100644 --- a/tests/test_api/test_ebel/test_pathway_commons.py +++ b/tests/test_api/test_ebel/test_pathway_commons.py @@ -1,28 +1,24 @@ """Pathway Commons API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestPathwayCommons: def test_get_pathway_commons(self, client): response = client.get( - 'api/v1/pathway_commons?participant_a=CSF3&interaction_type=controls-expression-of&participant_b=CD33&page_size=10&page=1', - content_type='application/json' + "api/v1/pathway_commons?participant_a=CSF3&interaction_type=controls-expression-of&participant_b=CD33&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "id": 804357, - "interaction_type": "controls-expression-of", - "participant_a": "CSF3", - "participant_b": "CD33", - 
"pathway_names": [], - "pmids": [ - 12627849 - ], - "sources": [ - "CTD" - ] + "id": 804357, + "interaction_type": "controls-expression-of", + "participant_a": "CSF3", + "participant_b": "CD33", + "pathway_names": [], + "pmids": [12627849], + "sources": ["CTD"], } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -34,12 +30,19 @@ def test_get_pathway_commons(self, client): def test_get_by_gene_symbol(self, client): response = client.get( - 'api/v1/pathway_commons/by_gene_symbol?gene_symbol=CD33&page_size=10&page=1', - content_type='application/json' + "api/v1/pathway_commons/by_gene_symbol?gene_symbol=CD33&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_cols = ("id", "interaction_type", "participant_a", "participant_b", "pathway_names", "pmids", - "sources") + expected_cols = ( + "id", + "interaction_type", + "participant_a", + "participant_b", + "pathway_names", + "pmids", + "sources", + ) assert output[NUM_RESULTS] >= 19 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -51,22 +54,18 @@ def test_get_by_gene_symbol(self, client): def test_get_by_pathway_name(self, client): response = client.get( - 'api/v1/pathway_commons/by_pathway_name?pathway_name=Activation%20and%20oligomerization%20of%20BAK%20protein&page_size=10&page=1', - content_type='application/json' + "api/v1/pathway_commons/by_pathway_name?pathway_name=Activation%20and%20oligomerization%20of%20BAK%20protein&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_result = { - "id": 69305, - "interaction_type": "in-complex-with", - "participant_a": "BAK1", - "participant_b": "BID", - "pathway_names": [ - "Activation and oligomerization of BAK protein" - ], - "pmids": [], - "sources": [ - "Reactome" - ] + "id": 69305, + "interaction_type": "in-complex-with", + "participant_a": "BAK1", + "participant_b": "BID", + "pathway_names": ["Activation and oligomerization of BAK 
protein"], + "pmids": [], + "sources": ["Reactome"], } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -78,12 +77,18 @@ def test_get_by_pathway_name(self, client): def test_get_by_pmid(self, client): response = client.get( - 'api/v1/pathway_commons/by_pmid?pmid=12627849&page_size=10&page=1', - content_type='application/json' + "api/v1/pathway_commons/by_pmid?pmid=12627849&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) - expected_cols = ("id", "interaction_type", "participant_a", "participant_b", "pathway_names", "pmids", - "sources") + expected_cols = ( + "id", + "interaction_type", + "participant_a", + "participant_b", + "pathway_names", + "pmids", + "sources", + ) assert output[NUM_RESULTS] >= 6 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -95,8 +100,8 @@ def test_get_by_pmid(self, client): def test_get_pathway_name_starts_with(self, client): response = client.get( - 'api/v1/pathway_commons/pathway_name/starts_with?pathway_name=Activation%20and%20oligomerization%20of%20BAK&page_size=10&page=1', - content_type='application/json' + "api/v1/pathway_commons/pathway_name/starts_with?pathway_name=Activation%20and%20oligomerization%20of%20BAK&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) assert output[NUM_RESULTS] == 1 diff --git a/tests/test_api/test_ebel/test_protein_atlas.py b/tests/test_api/test_ebel/test_protein_atlas.py index e09c4f6..0f89691 100644 --- a/tests/test_api/test_ebel/test_protein_atlas.py +++ b/tests/test_api/test_ebel/test_protein_atlas.py @@ -1,23 +1,23 @@ """Protein Atlas API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestProteinAtlas: def test_get_rna_brain_fantom(self, client): response = client.get( - 
'api/v1/ebel/protein_atlas/rna_brain_fantom?gene=ENSG00000105383&gene_name=CD33&brain_region=amygdala&tags_per_million=10.7&scaled_tags_per_million=12.4&nx=3.6&page_size=10&page=1', - content_type='application/json' + "api/v1/ebel/protein_atlas/rna_brain_fantom?gene=ENSG00000105383&gene_name=CD33&brain_region=amygdala&tags_per_million=10.7&scaled_tags_per_million=12.4&nx=3.6&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "brain_region": "amygdala", - "gene": "ENSG00000105383", - "gene_name": "CD33", - "nx": "3.6", - "scaled_tags_per_million": "12.4", - "tags_per_million": "10.7" + "brain_region": "amygdala", + "gene": "ENSG00000105383", + "gene_name": "CD33", + "nx": "3.6", + "scaled_tags_per_million": "12.4", + "tags_per_million": "10.7", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -29,15 +29,15 @@ def test_get_rna_brain_fantom(self, client): def test_get_rna_mouse_brain_allen(self, client): response = client.get( - 'api/v1/ebel/protein_atlas/rna_mouse_brain_allen?gene=ENSG00000095970&gene_name=TREM2&brain_region=amygdala&expression_energy=0.5&page_size=10&page=1', - content_type='application/json' + "api/v1/ebel/protein_atlas/rna_mouse_brain_allen?gene=ENSG00000095970&gene_name=TREM2&brain_region=amygdala&expression_energy=0.5&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "brain_region": "amygdala", - "expression_energy": 0.5, - "gene": "ENSG00000095970", - "gene_name": "TREM2" + "brain_region": "amygdala", + "expression_energy": 0.5, + "gene": "ENSG00000095970", + "gene_name": "TREM2", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -49,17 +49,17 @@ def test_get_rna_mouse_brain_allen(self, client): def test_get_normal_tissue(self, client): response = client.get( - 
'api/v1/ebel/protein_atlas/normal_tissue?gene=ENSG00000105383&gene_name=CD33&tissue=adipose%20tissue&cell_type=adipocytes&level=Not%20detected&reliability=Approved&page_size=10&page=1', - content_type='application/json' + "api/v1/ebel/protein_atlas/normal_tissue?gene=ENSG00000105383&gene_name=CD33&tissue=adipose%20tissue&cell_type=adipocytes&level=Not%20detected&reliability=Approved&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "cell_type": "adipocytes", - "gene": "ENSG00000105383", - "gene_name": "CD33", - "level": "Not detected", - "reliability": "Approved", - "tissue": "adipose tissue" + "cell_type": "adipocytes", + "gene": "ENSG00000105383", + "gene_name": "CD33", + "level": "Not detected", + "reliability": "Approved", + "tissue": "adipose tissue", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -71,25 +71,25 @@ def test_get_normal_tissue(self, client): def test_get_subcellular_location(self, client): response = client.get( - 'api/v1/ebel/protein_atlas/rna_subcellular_location?gene=ENSG00000105383&gene_name=CD33&reliability=Approved&main_location=Nucleoplasm%3BPlasma%20membrane&supported=Plasma%20membrane&uncertain=Nucleoplasm&go_id=Nucleoplasm%20%28GO%3A0005654%29%3BPlasma%20membrane%20%28GO%3A0005886%29&page_size=10&page=1', - content_type='application/json' + "api/v1/ebel/protein_atlas/rna_subcellular_location?gene=ENSG00000105383&gene_name=CD33&reliability=Approved&main_location=Nucleoplasm%3BPlasma%20membrane&supported=Plasma%20membrane&uncertain=Nucleoplasm&go_id=Nucleoplasm%20%28GO%3A0005654%29%3BPlasma%20membrane%20%28GO%3A0005886%29&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "additional_location": None, - "approved": None, - "cell_cycle_dependency": None, - "enhanced": None, - "extracellular_location": None, - "gene": "ENSG00000105383", - "gene_name": "CD33", - "go_id": 
"Nucleoplasm (GO:0005654);Plasma membrane (GO:0005886)", - "main_location": "Nucleoplasm;Plasma membrane", - "reliability": "Approved", - "single_cell_variation_intensity": None, - "single_cell_variation_spatial": None, - "supported": "Plasma membrane", - "uncertain": "Nucleoplasm" + "additional_location": None, + "approved": None, + "cell_cycle_dependency": None, + "enhanced": None, + "extracellular_location": None, + "gene": "ENSG00000105383", + "gene_name": "CD33", + "go_id": "Nucleoplasm (GO:0005654);Plasma membrane (GO:0005886)", + "main_location": "Nucleoplasm;Plasma membrane", + "reliability": "Approved", + "single_cell_variation_intensity": None, + "single_cell_variation_spatial": None, + "supported": "Plasma membrane", + "uncertain": "Nucleoplasm", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 @@ -101,16 +101,11 @@ def test_get_subcellular_location(self, client): def test_get_rna_tissue_consensus(self, client): response = client.get( - 'api/v1/ebel/protein_atlas/rna_tissue_consensus?gene=ENSG00000105383&gene_name=CD33&tissue=adipose%20tissue&nx=10.2&page_size=10&page=1', - content_type='application/json' + "api/v1/ebel/protein_atlas/rna_tissue_consensus?gene=ENSG00000105383&gene_name=CD33&tissue=adipose%20tissue&nx=10.2&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_results = { - "gene": "ENSG00000105383", - "gene_name": "CD33", - "nx": 10.2, - "tissue": "adipose tissue" - } + expected_results = {"gene": "ENSG00000105383", "gene_name": "CD33", "nx": 10.2, "tissue": "adipose tissue"} assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -121,17 +116,17 @@ def test_get_rna_tissue_consensus(self, client): def test_get_rna_brain_gtex(self, client): response = client.get( - 'api/v1/ebel/protein_atlas/rna_brain_gtex?gene=ENSG00000000003&gene_name=TSPAN6&brain_region=amygdala&tpm=7.3&p_tpm=9.0&nx=7.0&page_size=10&page=1', - 
content_type='application/json' + "api/v1/ebel/protein_atlas/rna_brain_gtex?gene=ENSG00000000003&gene_name=TSPAN6&brain_region=amygdala&tpm=7.3&p_tpm=9.0&nx=7.0&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "brain_region": "amygdala", - "gene": "ENSG00000000003", - "gene_name": "TSPAN6", - "nx": 7, - "p_tpm": 9, - "tpm": 7.3 + "brain_region": "amygdala", + "gene": "ENSG00000000003", + "gene_name": "TSPAN6", + "nx": 7, + "p_tpm": 9, + "tpm": 7.3, } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_reactome.py b/tests/test_api/test_ebel/test_reactome.py index fb29d8b..be639f3 100644 --- a/tests/test_api/test_ebel/test_reactome.py +++ b/tests/test_api/test_ebel/test_reactome.py @@ -1,23 +1,23 @@ """Reactome API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestReactome: def test_get_reactome(self, client): response = client.get( - 'api/v1/reactome?identifier=R-HSA-198933&uniprot_accession=P20138&organism=Homo%20sapiens&name=Immunoregulatory%20interactions%20between%20a%20Lymphoid%20and%20a%20non-Lymphoid%20cell&evidence_type=TAS&page_size=10&page=1', - content_type='application/json' + "api/v1/reactome?identifier=R-HSA-198933&uniprot_accession=P20138&organism=Homo%20sapiens&name=Immunoregulatory%20interactions%20between%20a%20Lymphoid%20and%20a%20non-Lymphoid%20cell&evidence_type=TAS&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "evidence_type": "TAS", - "id": 140784, - "identifier": "R-HSA-198933", - "name": "Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell", - "organism": "Homo sapiens", - "uniprot_accession": "P20138" + "evidence_type": "TAS", + "id": 140784, + "identifier": "R-HSA-198933", + "name": 
"Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell", + "organism": "Homo sapiens", + "uniprot_accession": "P20138", } assert output[NUM_RESULTS] == 1 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_api/test_ebel/test_stringdb.py b/tests/test_api/test_ebel/test_stringdb.py index fd971ac..5077416 100644 --- a/tests/test_api/test_ebel/test_stringdb.py +++ b/tests/test_api/test_ebel/test_stringdb.py @@ -1,14 +1,14 @@ """StringDB API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestStringdb: def test_get_stringdb(self, client): response = client.get( - 'api/v1/string?protein1=9606.ENSP00000000412&protein2=9606.ENSP00000000233&symbol1=M6PR&symbol2=ARF5&neighborhood=0&neighborhood_transferred=0&fusion=0&cooccurence=0&homology=0&coexpression=82&coexpression_transferred=61&experiments=0&experiments_transferred=0&database=0&database_transferred=0&textmining=105&textmining_transferred=0&combined_score=161&page_size=10&page=1', - content_type='application/json' + "api/v1/string?protein1=9606.ENSP00000000412&protein2=9606.ENSP00000000233&symbol1=M6PR&symbol2=ARF5&neighborhood=0&neighborhood_transferred=0&fusion=0&cooccurence=0&homology=0&coexpression=82&coexpression_transferred=61&experiments=0&experiments_transferred=0&database=0&database_transferred=0&textmining=105&textmining_transferred=0&combined_score=161&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) @@ -30,7 +30,7 @@ def test_get_stringdb(self, client): "symbol1": "M6PR", "symbol2": "ARF5", "textmining": 105, - "textmining_transferred": 0 + "textmining_transferred": 0, } assert output[NUM_RESULTS] == 1 @@ -43,21 +43,21 @@ def test_get_stringdb(self, client): def test_get_stringdb_action(self, client): response = client.get( - 
'api/v1/string/action?item_id_a=9606.ENSP00000216366&item_id_b=9606.ENSP00000000233&symbol1=AP4S1&symbol2=ARF5&mode=binding&score=165&page_size=10&page=1', - content_type='application/json' + "api/v1/string/action?item_id_a=9606.ENSP00000216366&item_id_b=9606.ENSP00000000233&symbol1=AP4S1&symbol2=ARF5&mode=binding&score=165&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { - "a_is_acting": False, - "action": None, - "is_directional": False, - "item_id_a": "9606.ENSP00000216366", - "item_id_b": "9606.ENSP00000000233", - "mode": "binding", - "score": 165, - "symbol1": "AP4S1", - "symbol2": "ARF5" + "a_is_acting": False, + "action": None, + "is_directional": False, + "item_id_a": "9606.ENSP00000216366", + "item_id_b": "9606.ENSP00000000233", + "mode": "binding", + "score": 165, + "symbol1": "AP4S1", + "symbol2": "ARF5", } assert output[NUM_RESULTS] == 1 @@ -67,4 +67,3 @@ def test_get_stringdb_action(self, client): hit = results[0] assert isinstance(hit, dict) assert hit == expected_results - diff --git a/tests/test_api/test_ebel/test_uniprot.py b/tests/test_api/test_ebel/test_uniprot.py index 9a76883..852f67f 100644 --- a/tests/test_api/test_ebel/test_uniprot.py +++ b/tests/test_api/test_ebel/test_uniprot.py @@ -1,22 +1,19 @@ """UniProt API unit tests.""" from .conftest import format_response_data -from .constants import RESULTS, NUM_RESULTS, PAGE_SIZE +from .constants import NUM_RESULTS, PAGE_SIZE, RESULTS class TestUniprot: def test_get_uniprot(self, client): response = client.get( - 'api/v1/uniprot?accession=P20138&gene_symbol=CD33&taxonomy_id=9606&keyword=Phosphoprotein&xref_db=ChEMBL&xref_id=CHEMBL1842&subcellular_location=Peroxisome&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot?accession=P20138&gene_symbol=CD33&taxonomy_id=9606&keyword=Phosphoprotein&xref_db=ChEMBL&xref_id=CHEMBL1842&subcellular_location=Peroxisome&page_size=10&page=1", + 
content_type="application/json", ) output = format_response_data(response) expected_sample = { # Only check a subset of data "accession": "P20138", - "gene_names": [ - "CD33", - "SIGLEC3" - ], + "gene_names": ["CD33", "SIGLEC3"], "gene_symbol": "CD33", "taxid": 9606, "name": "CD33_HUMAN", @@ -33,8 +30,7 @@ def test_get_uniprot(self, client): def test_get_keyword_starts_with(self, client): response = client.get( - 'api/v1/uniprot/keyword/starts_with?keyword=Phago&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot/keyword/starts_with?keyword=Phago&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) assert output[NUM_RESULTS] == 1 @@ -46,8 +42,8 @@ def test_get_keyword_starts_with(self, client): def test_get_subcellular_location_starts_with(self, client): response = client.get( - 'api/v1/uniprot/subcellular_location/starts_with?subcellular_location=Endo&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot/subcellular_location/starts_with?subcellular_location=Endo&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) assert output[NUM_RESULTS] == 10 @@ -59,14 +55,11 @@ def test_get_subcellular_location_starts_with(self, client): def test_get_gene_symbol_starts_with(self, client): response = client.get( - 'api/v1/uniprot/gene_symbol/starts_with?gene_symbol=CD18&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot/gene_symbol/starts_with?gene_symbol=CD18&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_results = { - "CD180": 2690, - "Cd180": 22498 - } + expected_results = {"CD180": 2690, "Cd180": 22498} assert output[NUM_RESULTS] == 2 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -75,16 +68,10 @@ def test_get_gene_symbol_starts_with(self, client): def test_get_gene_starts_with(self, client): response = client.get( - 
'api/v1/uniprot/gene/starts_with?gene=CD11&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot/gene/starts_with?gene=CD11&page_size=10&page=1", content_type="application/json" ) output = format_response_data(response) - expected_results = { - "CD1176A": 294700, - "CD11A": 1040093, - "CD11B": 1040099, - "CD11C": 1040111 - } + expected_results = {"CD1176A": 294700, "CD11A": 1040093, "CD11B": 1040099, "CD11C": 1040111} assert output[NUM_RESULTS] == 4 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -93,14 +80,11 @@ def test_get_gene_starts_with(self, client): def test_get_organism_starts_with(self, client): response = client.get( - 'api/v1/uniprot/organism/starts_with?organism=Homo%20sapiens&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot/organism/starts_with?organism=Homo%20sapiens&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) - expected_results = { - "Homo sapiens": 9606, - "Homo sapiens neanderthalensis": 63221 - } + expected_results = {"Homo sapiens": 9606, "Homo sapiens neanderthalensis": 63221} assert output[NUM_RESULTS] == 2 assert output[PAGE_SIZE] == 10 results = output[RESULTS] @@ -109,13 +93,13 @@ def test_get_organism_starts_with(self, client): def test_get_function_starts_with(self, client): response = client.get( - 'api/v1/uniprot/function/starts_with?description=Phosphatase%202A&page_size=10&page=1', - content_type='application/json' + "api/v1/uniprot/function/starts_with?description=Phosphatase%202A&page_size=10&page=1", + content_type="application/json", ) output = format_response_data(response) expected_results = { "Phosphatase 2A affects a variety of biological processes in the cell such as transcription, cell cycle progression and cellular morphogenesis, and provides an initial identification of critical substrates for this phosphatase. 
The regulatory subunit may direct the catalytic subunit to distinct, albeit overlapping, subsets of substrates (By similarity).": 15, - "Phosphatase 2A affects a variety of biological processes in the cell such as transcription, cell cycle progression and cellular morphogenesis, and provides an initial identification of critical substrates for this phosphatase. The regulatory subunit may direct the catalytic subunit to distinct, albeit overlapping, subsets of substrates.": 16 + "Phosphatase 2A affects a variety of biological processes in the cell such as transcription, cell cycle progression and cellular morphogenesis, and provides an initial identification of critical substrates for this phosphatase. The regulatory subunit may direct the catalytic subunit to distinct, albeit overlapping, subsets of substrates.": 16, } assert output[NUM_RESULTS] == 2 assert output[PAGE_SIZE] == 10 diff --git a/tests/test_grammar/test_validate.py b/tests/test_grammar/test_validate.py index 2e9057f..d91c1ea 100644 --- a/tests/test_grammar/test_validate.py +++ b/tests/test_grammar/test_validate.py @@ -2,11 +2,11 @@ import os import pathlib + import pandas as pd from ebel.validate import validate_bel_file - VALIDATION_TEST_DIR = pathlib.Path(__file__).parent.absolute() TEST_DATA_DIR = os.path.join(VALIDATION_TEST_DIR, "..", "data") diff --git a/tests/test_manager/constants.py b/tests/test_manager/constants.py index 15fd7e4..006c61e 100644 --- a/tests/test_manager/constants.py +++ b/tests/test_manager/constants.py @@ -3,8 +3,8 @@ import pathlib from ebel import Bel -from ebel.constants import DEFAULT_ODB from ebel.config import get_config_value +from ebel.constants import DEFAULT_ODB from ebel.manager.orientdb.odb_meta import Graph # Paths @@ -16,12 +16,12 @@ TEST_JSON = TEST_BEL + ".json" # Parameters for TEST database -USER = get_config_value(DEFAULT_ODB, 'user') -PASSWORD = get_config_value(DEFAULT_ODB, 'password') -DB_NAME = 'ebel_test' -SERVER = 'localhost' +USER = 
get_config_value(DEFAULT_ODB, "user") +PASSWORD = get_config_value(DEFAULT_ODB, "password") +DB_NAME = "ebel_test" +SERVER = "localhost" PORT = "2424" -ROOT_PWD = get_config_value(DEFAULT_ODB, 'root_password') +ROOT_PWD = get_config_value(DEFAULT_ODB, "root_password") if ROOT_PWD is None: raise ValueError("Need root password to perform tests. Please add 'root_password' to configuration file") @@ -33,7 +33,7 @@ "password": PASSWORD, "server": SERVER, "port": PORT, - "root_password": ROOT_PWD + "root_password": ROOT_PWD, } test_client = Bel(graph_config=config_params) diff --git a/tests/test_manager/test_bel.py b/tests/test_manager/test_bel.py index 6af3972..8efa579 100644 --- a/tests/test_manager/test_bel.py +++ b/tests/test_manager/test_bel.py @@ -1,7 +1,7 @@ """Bel module tests.""" -from .constants import test_client, TEST_JSON -from .true_values import NODES_EXTENSION, EDGES_EXTENSION +from .constants import TEST_JSON, test_client +from .true_values import EDGES_EXTENSION, NODES_EXTENSION bel = test_client @@ -24,27 +24,27 @@ class TestBel: def test_bel_properties(self): """Test that all properties are present.""" - assert hasattr(bel, 'hgnc') - assert hasattr(bel, 'uniprot') + assert hasattr(bel, "hgnc") + assert hasattr(bel, "uniprot") # assert hasattr(bel, 'dea') - assert hasattr(bel, 'drugbank') - assert hasattr(bel, 'gwas_catalog') - assert hasattr(bel, 'reactome') - assert hasattr(bel, 'biogrid') - assert hasattr(bel, 'stringdb') - assert hasattr(bel, 'clinical_trials') - assert hasattr(bel, 'intact') - assert hasattr(bel, 'clinvar') - assert hasattr(bel, 'mirtarbase') - assert hasattr(bel, 'disgenet') - assert hasattr(bel, 'pathway_commons') - assert hasattr(bel, 'kegg') - assert hasattr(bel, 'ensembl') - assert hasattr(bel, 'iuphar') - assert hasattr(bel, 'chebi') - assert hasattr(bel, 'nsides') - assert hasattr(bel, 'ncbi') - assert hasattr(bel, 'protein_atlas') + assert hasattr(bel, "drugbank") + assert hasattr(bel, "gwas_catalog") + assert hasattr(bel, 
"reactome") + assert hasattr(bel, "biogrid") + assert hasattr(bel, "stringdb") + assert hasattr(bel, "clinical_trials") + assert hasattr(bel, "intact") + assert hasattr(bel, "clinvar") + assert hasattr(bel, "mirtarbase") + assert hasattr(bel, "disgenet") + assert hasattr(bel, "pathway_commons") + assert hasattr(bel, "kegg") + assert hasattr(bel, "ensembl") + assert hasattr(bel, "iuphar") + assert hasattr(bel, "chebi") + assert hasattr(bel, "nsides") + assert hasattr(bel, "ncbi") + assert hasattr(bel, "protein_atlas") def test_import_json(self): """Tests update_from_protein2gene feature of import_json.""" @@ -55,9 +55,7 @@ def test_import_json(self): assert all([count == 0 for count in bel.number_of_nodes.values()]) assert all([count == 0 for count in bel.number_of_edges.values()]) - files_imported = bel.import_json(input_path=TEST_JSON, - update_from_protein2gene=True, - extend_graph=False) + files_imported = bel.import_json(input_path=TEST_JSON, update_from_protein2gene=True, extend_graph=False) assert len(files_imported) == 1 assert bel.number_of_nodes == NODES_EXTENSION diff --git a/tests/test_manager/test_biodbs/test_enrich.py b/tests/test_manager/test_biodbs/test_enrich.py index 49c0d6d..155e926 100644 --- a/tests/test_manager/test_biodbs/test_enrich.py +++ b/tests/test_manager/test_biodbs/test_enrich.py @@ -1,12 +1,11 @@ """Unit tests for the enrich methods.""" -from ..constants import test_client, TEST_JSON +from ..constants import TEST_JSON, test_client bel = test_client class TestEnrich: - # TODO: Erasing and reimporting causes an error when these tests are run with test_bel.py # Make sure test DB is populated bel.clear_all_nodes_and_edges() @@ -15,16 +14,18 @@ class TestEnrich: assert all([count == 0 for count in bel.number_of_nodes.values()]) assert all([count == 0 for count in bel.number_of_edges.values()]) - bel.import_json(input_path=TEST_JSON, - update_from_protein2gene=True, - extend_graph=False,) + bel.import_json( + input_path=TEST_JSON, + 
update_from_protein2gene=True, + extend_graph=False, + ) def test_uniprot_enrich(self): updated = bel.uniprot.update_bel() assert isinstance(updated, dict) assert all([species in updated for species in ("HGNC", "MGI", "RGD")]) assert all([isinstance(val, int) for val in updated.values()]) - assert updated['HGNC'] == 3 + assert updated["HGNC"] == 3 def test_hgnc_enrich(self): updated = bel.hgnc.update_bel() @@ -39,7 +40,7 @@ def test_chebi_enrich(self): def test_gwas_enrich(self): updated = bel.gwas_catalog.update_bel() assert isinstance(updated, dict) - assert updated == {'has_mapped_snp_gc': 1141, 'has_downstream_snp_gc': 401, 'has_upstream_snp_gc': 394} + assert updated == {"has_mapped_snp_gc": 1141, "has_downstream_snp_gc": 401, "has_upstream_snp_gc": 394} def test_clinvar_enrich(self): updated = bel.clinvar.update_bel() @@ -54,7 +55,7 @@ def test_reactome_enrich(self): def test_stringdb_enrich(self): updated = bel.stringdb.update_interactions() assert isinstance(updated, dict) - assert updated == {'interactions': 43, 'actions': 152} + assert updated == {"interactions": 43, "actions": 152} def test_biogrid_enrich(self): updated = bel.biogrid.update_interactions() @@ -69,9 +70,11 @@ def test_mirtarbase_enrich(self): def test_pc_enrich(self): updated = bel.pathway_commons.update_interactions() assert isinstance(updated, dict) - assert updated == {'controls-transport-of': 19, - 'controls-expression-of': 71, - 'controls-phosphorylation-of': 12} + assert updated == { + "controls-transport-of": 19, + "controls-expression-of": 71, + "controls-phosphorylation-of": 12, + } def test_disgenet_enrich(self): updated = bel.disgenet.update_interactions() diff --git a/tests/test_manager/test_biodbs/test_urls.py b/tests/test_manager/test_biodbs/test_urls.py index dc08d0e..e72ba0f 100644 --- a/tests/test_manager/test_biodbs/test_urls.py +++ b/tests/test_manager/test_biodbs/test_urls.py @@ -1,12 +1,18 @@ """Unit tests for checking the URLs.""" import ftplib + import requests +from 
ebel.manager.orientdb.constants import (BIOGRID, CHEBI, CLINICAL_TRIALS, + CLINVAR, DISGENET, DRUGBANK, + ENSEMBL, GWAS_CATALOG, HGNC, + INTACT, IUPHAR, KEGG, MIRTARBASE, + NCBI, NSIDES, PATHWAY_COMMONS, + PROTEIN_ATLAS, REACTOME, STRINGDB, + UNIPROT) + from ..constants import test_client -from ebel.manager.orientdb.constants import DRUGBANK, HGNC, CHEBI, ENSEMBL, GWAS_CATALOG, CLINVAR, UNIPROT, REACTOME, \ - STRINGDB, INTACT, BIOGRID, MIRTARBASE, PATHWAY_COMMONS, DISGENET, KEGG, IUPHAR, NSIDES, CLINICAL_TRIALS, \ - PROTEIN_ATLAS, NCBI bel = test_client @@ -33,14 +39,14 @@ def check_links(urls: dict): continue headers = requests.head(url, allow_redirects=True).headers - if 'Content-disposition' in headers: # Check file attachment - assert "filename" in headers.get('Content-disposition') + if "Content-disposition" in headers: # Check file attachment + assert "filename" in headers.get("Content-disposition") elif "Content-Encoding" in headers: assert headers.get("Content-Encoding") == "gzip" # Check if it's a gzip file else: - file_size = int(headers.get('Content-Length')) + file_size = int(headers.get("Content-Length")) assert file_size > 0 def test_urls(self): diff --git a/tests/test_manager/test_odb_meta.py b/tests/test_manager/test_odb_meta.py index ac27b70..0887b5b 100644 --- a/tests/test_manager/test_odb_meta.py +++ b/tests/test_manager/test_odb_meta.py @@ -1,6 +1,6 @@ """ODB interface tests.""" -from .constants import test_client, USER, PASSWORD, SERVER, PORT, DB_NAME +from .constants import DB_NAME, PASSWORD, PORT, SERVER, USER, test_client bel = test_client @@ -11,11 +11,13 @@ class TestConfig: def test_connection_params(self): """Test that the proper connection parameters are being called and used during client initialization.""" default_params = {"name": DB_NAME, "user": USER, "password": PASSWORD, "server": SERVER, "port": PORT} - client_config = {"name": bel.odb_db_name, - "user": bel.odb_user, - "password": bel.odb_password, - "server": bel.odb_server, - 
"port": bel.odb_port} + client_config = { + "name": bel.odb_db_name, + "user": bel.odb_user, + "password": bel.odb_password, + "server": bel.odb_server, + "port": bel.odb_port, + } # Check that the client configuration paramaters match the default ones initialized for testing assert all([client_config[param] == value for param, value in default_params.items()]) diff --git a/tests/test_manager/true_values.py b/tests/test_manager/true_values.py index 1020bb4..036212c 100644 --- a/tests/test_manager/true_values.py +++ b/tests/test_manager/true_values.py @@ -2,120 +2,120 @@ # True values after importing basic_import_test.bel.json with only extension - test_import_json NODES_EXTENSION = { - 'bel': 12, - 'nn': 12, - 'pure_object': 11, - 'location_object': 10, - 'bio_concept': 0, - 'biological_process': 0, - 'pathology': 0, - 'bio_object': 11, - 'complex': 0, - 'abundance': 1, - 'population': 0, - 'genetic_flow': 10, - 'gene': 4, - 'rna': 3, - 'protein': 3, - 'micro_rna': 0, - 'bio_act': 1, - 'activity': 1, - 'reaction': 0, - 'degradation': 0, - 'cell_secretion': 0, - 'translocation': 0, - 'cell_surface_expression': 0, - 'bio_list': 0, - 'list': 0, - 'composite': 0, - 'ebel': 0, - 'variant': 0, - 'fragment': 0, - 'location': 0, - 'pmod': 0, - 'gmod': 0, - 'from_location': 0, - 'to_location': 0, - 'reactants': 0, - 'products': 0, - 'fusion_protein': 0, - 'fusion_rna': 0, - 'fusion_gene': 0 + "bel": 12, + "nn": 12, + "pure_object": 11, + "location_object": 10, + "bio_concept": 0, + "biological_process": 0, + "pathology": 0, + "bio_object": 11, + "complex": 0, + "abundance": 1, + "population": 0, + "genetic_flow": 10, + "gene": 4, + "rna": 3, + "protein": 3, + "micro_rna": 0, + "bio_act": 1, + "activity": 1, + "reaction": 0, + "degradation": 0, + "cell_secretion": 0, + "translocation": 0, + "cell_surface_expression": 0, + "bio_list": 0, + "list": 0, + "composite": 0, + "ebel": 0, + "variant": 0, + "fragment": 0, + "location": 0, + "pmod": 0, + "gmod": 0, + "from_location": 
0, + "to_location": 0, + "reactants": 0, + "products": 0, + "fusion_protein": 0, + "fusion_rna": 0, + "fusion_gene": 0, } EDGES_EXTENSION = { - 'bel_relation': 9, - 'causal': 3, - 'increases': 1, - 'directly_increases': 1, - 'decreases': 1, - 'directly_decreases': 0, - 'rate_limiting_step_of': 0, - 'causes_no_change': 0, - 'regulates': 0, - 'correlative': 0, - 'negative_correlation': 0, - 'positive_correlation': 0, - 'association': 0, - 'no_correlation': 0, - 'genomic': 6, - 'orthologous': 0, - 'transcribed_to': 3, - 'translated_to': 3, - 'other': 0, - 'has_member': 0, - 'has_members': 0, - 'has_component': 0, - 'has_components': 0, - 'equivalent_to': 0, - 'is_a': 0, - 'sub_process_of': 0, - 'deprecated': 0, - 'analogous_to': 0, - 'biomarker_for': 0, - 'prognostic_biomarker_for': 0, - 'compiler': 0, - 'acts_in': 0, - 'has_product': 0, - 'has_variant': 0, - 'has_modification': 0, - 'reactant_in': 0, - 'translocates': 0, - 'includes': 0, - 'ebel_relation': 1, - 'has__protein': 1, - 'has__rna': 0, - 'has__gene': 0, - 'has__abundance': 0, - 'has__population': 0, - 'has__location': 0, - 'has__from_location': 0, - 'has__to_location': 0, - 'has__fragment': 0, - 'has__pmod': 0, - 'has__gmod': 0, - 'has__complex': 0, - 'has__micro_rna': 0, - 'has__variant': 0, - 'has__reactants': 0, - 'has__products': 0, - 'has__composite': 0, - 'has_fragmented_protein': 0, - 'has_modified': 0, - 'has_modified_protein': 0, - 'has_modified_gene': 0, - 'has_variant_obj': 0, - 'has_variant_gene': 0, - 'has_variant_rna': 0, - 'has_variant_protein': 0, - 'has_variant_micro_rna': 0, - 'has_located': 0, - 'has_located_gene': 0, - 'has_located_rna': 0, - 'has_located_protein': 0, - 'has_located_micro_rna': 0, - 'has_located_complex': 0, - 'has_located_abundance': 0, - 'has_located_population': 0, - 'pathway_interaction': 0, - 'has_ppi': 0 + "bel_relation": 9, + "causal": 3, + "increases": 1, + "directly_increases": 1, + "decreases": 1, + "directly_decreases": 0, + "rate_limiting_step_of": 0, + 
"causes_no_change": 0, + "regulates": 0, + "correlative": 0, + "negative_correlation": 0, + "positive_correlation": 0, + "association": 0, + "no_correlation": 0, + "genomic": 6, + "orthologous": 0, + "transcribed_to": 3, + "translated_to": 3, + "other": 0, + "has_member": 0, + "has_members": 0, + "has_component": 0, + "has_components": 0, + "equivalent_to": 0, + "is_a": 0, + "sub_process_of": 0, + "deprecated": 0, + "analogous_to": 0, + "biomarker_for": 0, + "prognostic_biomarker_for": 0, + "compiler": 0, + "acts_in": 0, + "has_product": 0, + "has_variant": 0, + "has_modification": 0, + "reactant_in": 0, + "translocates": 0, + "includes": 0, + "ebel_relation": 1, + "has__protein": 1, + "has__rna": 0, + "has__gene": 0, + "has__abundance": 0, + "has__population": 0, + "has__location": 0, + "has__from_location": 0, + "has__to_location": 0, + "has__fragment": 0, + "has__pmod": 0, + "has__gmod": 0, + "has__complex": 0, + "has__micro_rna": 0, + "has__variant": 0, + "has__reactants": 0, + "has__products": 0, + "has__composite": 0, + "has_fragmented_protein": 0, + "has_modified": 0, + "has_modified_protein": 0, + "has_modified_gene": 0, + "has_variant_obj": 0, + "has_variant_gene": 0, + "has_variant_rna": 0, + "has_variant_protein": 0, + "has_variant_micro_rna": 0, + "has_located": 0, + "has_located_gene": 0, + "has_located_rna": 0, + "has_located_protein": 0, + "has_located_micro_rna": 0, + "has_located_complex": 0, + "has_located_abundance": 0, + "has_located_population": 0, + "pathway_interaction": 0, + "has_ppi": 0, } diff --git a/tox.ini b/tox.ini index 50a6005..c7a6505 100644 --- a/tox.ini +++ b/tox.ini @@ -3,11 +3,8 @@ envlist = # Delete previous coverage reports coverage-clean - # Check the MANIFEST to make sure all files are accounted for - manifest - - # flake8 checks that PEP8 style guidelines are followed - flake8 + # checks that PEP8 style guidelines are followed + lint # Make sure all the documentation files are in order docs @@ -20,11 +17,13 @@ envlist = 
commands = coverage run -p -m pytest deps= coverage + coverage[toml] pytest [testenv:coverage-clean] -deps = coverage -skip_install = true +deps = + coverage + coverage[toml] commands = coverage erase [testenv:coverage-report] @@ -39,19 +38,14 @@ deps = check-manifest skip_install = true commands = check-manifest -[testenv:flake8base] +[testenv:lint] skip_install = true deps = - flake8 - flake8-colors - flake8-docstrings -commands = - flake8 {posargs:ebel/ tests/ setup.py} - -[testenv:flake8] -deps = {[testenv:flake8base]deps} + black + isort commands = - flake8 ebel/ setup.py + black ebel/ tests/ + isort ebel/ tests/ [testenv:docs] description = Build docs site with Mkdocs @@ -80,25 +74,13 @@ skip_install = true deps = bumpversion -[testenv:build] -basepython = python3 -skip_install = true -deps = - wheel - setuptools -commands = - python setup.py -q sdist bdist_wheel - [testenv:release] basepython = python3 skip_install = true deps = - {[testenv:build]deps} - twine + poetry commands = - {[testenv:build]commands} - twine check dist/* - twine upload --skip-existing dist/* --verbose + poetry publish --build [testenv:finish] basepython = python3 @@ -107,7 +89,6 @@ whitelist_externals = /bin/git /usr/local/bin/git deps = - {[testenv:build]deps} {[testenv:release]deps} bumpversion commands =