diff --git a/.travis.yml b/.travis.yml index addc912..243a2d0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,13 @@ +sudo: required language: python -python: - - "3.4" +services: + - docker # cd into the api project -before_install: cd api -install: pip install -r requirements.txt -# change from /api to /api/api -before_script: cd api -script: python manage.py test +before_install: + - make api +script: + - make test-api +after_success: + - docker login -e="$DOCKER_EMAIL" -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD" https://gcr.io + - docker tag hansard_api gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) + - docker push gcr.io/hansard-1012/hansard_api:$(git rev-parse HEAD) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c468171 --- /dev/null +++ b/Makefile @@ -0,0 +1,58 @@ +all: iepy api + +stop: docker-stop-api docker-stop-iepy + +# api ------------------ + +docker-build-api: + docker-compose -f docker-compose-api.yml build + +docker-up-api: docker-rm-api docker-build-api + docker-compose -f docker-compose-api.yml up -d + +docker-stop-api: + docker-compose -f docker-compose-api.yml stop + +docker-rm-api: docker-stop-api + docker-compose -f docker-compose-api.yml rm -f + +wait-for-postgres: docker-up-api + docker exec hansard_db_1 bash -c "while ! 
pg_isready; do echo \"$(date) - waiting for database to start\"; sleep 3; done" + +migrate-api: docker-up-api wait-for-postgres + docker exec hansard_api_1 python api/manage.py migrate + +createsuperuser-api: migrate-api + docker exec hansard_api_1 bash -c "echo \"from users.models import User; User.objects.create_superuser('admin', 'tim@rewire.it', 'sn0wb1rd')\" | python api/manage.py shell" + +init-data: migrate-api + docker exec -ti hansard_api_1 python api/manage.py init_data + +test-api: + docker exec -ti hansard_api_1 bash -c "cd api && python manage.py test" + +api: createsuperuser-api + echo "api running on localhost:8000" + +#-------------------- + +docker-build-iepy: + docker-compose -f docker-compose-iepy.yml build + +docker-up-iepy: docker-stop-iepy docker-build-iepy + docker-compose -f docker-compose-iepy.yml up -d + +docker-stop-iepy: + docker-compose -f docker-compose-iepy.yml stop + +docker-rm-iepy: docker-stop-iepy + docker-compose -f docker-compose-iepy.yml rm -f + +migrate-iepy: docker-up-iepy + docker exec hansard_iepy_1 python /hansard/bin/manage.py migrate + +createsuperuser-iepy: migrate-iepy + docker exec hansard_iepy_1 bash -c "echo \"from django.contrib.auth.models import User; User.objects.create_superuser('admin', 'tim@rewire.it', 'sn0wb1rd')\" | python /hansard/bin/manage.py shell" + +iepy: createsuperuser-iepy + echo "iepy running on localhost:8001" diff --git a/api/api/comments/__init__.py b/api/api/comments/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/api/comments/admin.py b/api/api/comments/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/api/api/comments/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
class CommentsConfig(AppConfig):
    """Django application configuration for the ``comments`` app."""

    # Name under which the application is registered.
    name = 'comments'
class TopicsConfig(AppConfig):
    """Django application configuration for the ``topics`` app."""

    # Name under which the application is registered.
    name = 'topics'
diff --git a/api/api/topics/views.py b/api/api/topics/views.py new file mode 100644 index 0000000..91ea44a --- /dev/null +++ b/api/api/topics/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/api/requirements.txt b/api/requirements.txt index f04a6af..419ca28 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -13,11 +13,16 @@ inflection==0.3.1 ipdb==0.9.0 ipython==4.1.2 ipython-genutils==0.1.0 +nltk==3.2 +numpy==1.11.0 oauthlib==1.0.3 path.py==8.1.2 pexpect==4.0.1 pickleshare==0.6 +psycopg2==2.6.1 ptyprocess==0.5.1 +requests==2.9.1 +sets==0.2.0 simplegeneric==0.8.1 six==1.10.0 traitlets==4.2.1 diff --git a/docker-compose-api.yml b/docker-compose-api.yml new file mode 100644 index 0000000..ad94a54 --- /dev/null +++ b/docker-compose-api.yml @@ -0,0 +1,16 @@ +db: + image: postgres:9.6 +api: + build: ./api + command: python api/manage.py runserver 0.0.0.0:8000 + ports: + - "8000:8000" + volumes: + - ./api:/app + links: + - db +# client: +# build: ./client +# command: ember server +# links: +# - api diff --git a/docker-compose-iepy.yml b/docker-compose-iepy.yml new file mode 100644 index 0000000..e67d94a --- /dev/null +++ b/docker-compose-iepy.yml @@ -0,0 +1,8 @@ +dbiepy: + image: postgres:9.6 +iepy: + build: ./iepy + ports: + - "8001:8001" + links: + - dbiepy diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 8cedb7e..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,20 +0,0 @@ -db: - image: postgres -search: - image: elasticsearch -api: - build: ./api - command: python api/manage.py runserver 0.0.0.0:8000 - ports: - - "8000:8000" - links: - - db - - search -# depends_on: -# - db -# - search -client: - build: ./client - command: ember server -# depends_on: -# - api diff --git a/iepy/Dockerfile b/iepy/Dockerfile new file mode 100644 index 0000000..1263dce --- /dev/null +++ b/iepy/Dockerfile @@ -0,0 +1,21 @@ +from python:3.4 + +RUN pip install numpy + +RUN pip install 
iepy + +RUN apt-get update && apt-get install -y openjdk-7-jre + +ENV JAVAHOME=/usr/bin/java + +RUN mkdir /root/.config + +RUN iepy --download-third-party-data + +COPY ./hansard /hansard + +RUN pip install psycopg2 + +EXPOSE 8001 + +CMD python /hansard/bin/manage.py runserver 0.0.0.0:8001 diff --git a/iepy/docker-compose.yml b/iepy/docker-compose.yml new file mode 100644 index 0000000..5f9a4f1 --- /dev/null +++ b/iepy/docker-compose.yml @@ -0,0 +1,10 @@ +db: + image: postgres +iepy: + build: . + ports: + - "8000:8000" + volumes: + - ./hansard:/hansard + links: + - db diff --git a/iepy/hansard/__init__.py b/iepy/hansard/__init__.py new file mode 100644 index 0000000..5c1a7f9 --- /dev/null +++ b/iepy/hansard/__init__.py @@ -0,0 +1 @@ +from . import rules \ No newline at end of file diff --git a/iepy/hansard/bin/csv_to_iepy.py b/iepy/hansard/bin/csv_to_iepy.py new file mode 100644 index 0000000..002e5d1 --- /dev/null +++ b/iepy/hansard/bin/csv_to_iepy.py @@ -0,0 +1,28 @@ +""" +IEPY database loader from csv file + +Usage: + csv_to_iepy.py + csv_to_iepy.py -h | --help + +The argument can be a .csv file or a .csv.gz file containing the +corpus in two columns: 'freebase_mid' and 'description'. + +Options: + -h --help Show this screen + --version Version number +""" + +import logging + +from docopt import docopt + +import iepy +iepy.setup(__file__) +from iepy.utils import csv_to_iepy + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(message)s') + opts = docopt(__doc__, version=iepy.__version__) + filepath = opts[""] + csv_to_iepy(filepath) diff --git a/iepy/hansard/bin/gazettes_loader.py b/iepy/hansard/bin/gazettes_loader.py new file mode 100644 index 0000000..26322d1 --- /dev/null +++ b/iepy/hansard/bin/gazettes_loader.py @@ -0,0 +1,76 @@ +""" +IEPY gazettes loader + +Usage: + gazettes_loader.py + + +The argument can be a .csv file or a .csv.gz file containing the +gazettes in two columns: 'literal' and 'class'. 
def _create_gazette_entries(entries_list):
    """Save one GazetteItem per ``(literal, kind_name)`` pair.

    Both values are stripped of surrounding whitespace before use. The
    EntityKind for each kind name is fetched (or created) once and then
    reused from a local cache for the rest of the run.

    A save that raises IntegrityError is logged as a warning, the error is
    printed, and processing continues with the next pair.

    When the iterable is exhausted, the number of items that were actually
    saved is printed.
    """
    kind_cache = {}
    created = 0
    for literal, kind_name in entries_list:
        literal = literal.strip()
        kind_name = kind_name.strip()
        kind = kind_cache.get(kind_name)
        if kind is None:
            kind, _ = EntityKind.objects.get_or_create(name=kind_name)
            kind_cache[kind_name] = kind
        gazette = GazetteItem(text=literal, kind=kind)

        try:
            gazette.save()
        except IntegrityError as error:
            logging.warning(
                "Gazette '{}' of class '{}' not loaded, literal already existed".format(
                    literal, kind_name))
            print(error)
        else:
            # Count only the rows that were really saved, so rejected
            # duplicates do not inflate the reported total.
            created += 1
    print('Created {} new gazette items'.format(created))
def run_from_command_line():
    """Run the rule-based extractor for the relation named in the rules file.

    Reads ``RELATION`` from the instance rules module, looks up the matching
    Relation, loads the rules and the candidate evidences, runs a
    RuleBasedCore over them and hands the predictions to
    ``output.dump_output_loop``.

    Exits with status 1 when ``RELATION`` is not defined or when no Relation
    with that name exists.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # The rules module of the instance names the relation to work with.
    try:
        target_name = iepy.instance.rules.RELATION
    except AttributeError:
        logging.error("RELATION not defined in rules file")
        sys.exit(1)

    try:
        target_relation = models.Relation.objects.get(name=target_name)
    except ObjectDoesNotExist:
        logging.error("Relation {!r} not found".format(target_name))
        sys.exit(1)

    rule_set = load_rules()
    candidates = CandidateEvidenceManager.candidates_for_relation(target_relation)

    # Build the core, let it process, then predict over every candidate.
    core = RuleBasedCore(target_relation, rule_set)
    core.start()
    core.process()
    output.dump_output_loop(core.predict(candidates))
def _get_tuning_mode(opts):
    """Translate the ``--tune-for`` option into a tuning constant.

    Returns HIPREC for ``'high-prec'`` and HIREC for ``'high-recall'``.
    For any other value, prints an error followed by the module usage text
    and exits with status 1.
    """
    requested = opts['--tune-for']
    if requested == 'high-prec':
        return HIPREC
    if requested == 'high-recall':
        return HIREC

    # Unknown value: report it, show the usage text, and stop.
    print ('Invalid tuning mode')
    print (__doc__)
    exit(1)
labeled_evidences, tuning_mode): + config_filepath = opts.get("--extractor-config") + if not config_filepath: + config_filepath = os.path.join(INSTANCE_PATH, "extractor_config.json") + + if not os.path.exists(config_filepath): + print("Error: extractor config does not exists, please create the " + "file extractor_config.json or use the --extractor-config") + exit(1) + + with open(config_filepath) as filehandler: + try: + extractor_config = json.load(filehandler) + except Exception as error: + print("Error: unable to load extractor config: {}".format(error)) + exit(1) + + iextractor = ActiveLearningCore( + relation, labeled_evidences, extractor_config, tradeoff=tuning_mode + ) + return iextractor + + +def run_from_command_line(): + opts = docopt(__doc__, version=iepy.__version__) + + logging.basicConfig(level=logging.INFO, format='%(message)s') + logging.getLogger("featureforge").setLevel(logging.WARN) + + tuning_mode = _get_tuning_mode(opts) + relation = _get_relation(opts) + + candidates = CandidateEvidenceManager.candidates_for_relation(relation) + labeled_evidences = load_labeled_evidences(relation, candidates) + + if opts.get('--trained-extractor'): + iextractor = _load_extractor(opts, relation, labeled_evidences) + was_ever_trained = True + opts["--no-questions"] = True + else: + iextractor = _construct_extractor(opts, relation, labeled_evidences, tuning_mode) + iextractor.start() + was_ever_trained = False + + if not opts.get("--no-questions", False): + questions_loop(iextractor, relation, was_ever_trained) + + # Predict and store output + predictions = iextractor.predict(candidates) # asking predictions for EVERYTHING + if not predictions: + print("Nothing was predicted") + exit(1) + + if opts.get("--db-store"): + output.dump_predictions_to_database(relation, predictions) + + output_file = opts.get("") + if output_file: + output.dump_runner_output_to_csv(predictions, output_file) + + classifier_output = opts.get("--store-extractor") + if classifier_output: + 
def questions_loop(iextractor, relation, was_ever_trained):
    """Ask for labels on the extractor's questions until none remain.

    On every round the current questions are shown through a
    TerminalAdministration, the labels now stored for those questions are
    fed back to the extractor with ``add_answer``, and the extractor is
    processed again. The loop ends when the extractor has no more
    questions or when the terminal returns the extra ``STOP`` option.

    If no round was executed and ``was_ever_trained`` is false, the
    extractor is processed once so that it can be asked for predictions.
    """
    STOP = u'STOP'
    terminal = TerminalAdministration(
        relation, extra_options=[(STOP, u'Stop execution')])

    while iextractor.questions:
        # Work on a snapshot of the question list.
        pending = list(iextractor.questions)
        terminal.update_candidate_evidences_to_label(pending)
        outcome = terminal()

        answered = 0
        labels = load_labeled_evidences(relation, pending)
        for candidate, label_value in labels.items():
            if label_value is None:
                continue
            iextractor.add_answer(candidate, label_value)
            answered += 1
        print ('Added %s new human labels to the extractor core' % answered)

        iextractor.process()
        was_ever_trained = True
        if outcome == STOP:
            break

    if not was_ever_trained:
        # It's needed to run some process before asking for predictions
        iextractor.process()
if __name__ == '__main__':
    # Script entry point: choose which documents to preprocess and how.
    #   --multiple-cores : one worker process per core, each taking the
    #                      documents whose id modulo the core count matches.
    #   --split-in/--run-part : process only one slice in this process.
    #   otherwise        : process every pending document in this process.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    opts = docopt(__doc__, version=iepy.__version__)
    increment_ner = opts['--increment-ner']

    dm = ParallelDocManager()
    all_docs = dm.get_documents_lacking_preprocess(
        [PreProcessSteps.segmentation, PreProcessSteps.syntactic_parsing])

    multiple_cores = opts.get('--multiple-cores')
    split_in = opts.get("--split-in")
    run_part = opts.get("--run-part")

    if multiple_cores:
        if multiple_cores == "all":
            multiple_cores = multiprocessing.cpu_count()
        try:
            multiple_cores = int(multiple_cores)
        except ValueError:
            logger.error("Invalid number of cores")
            exit(1)

        for i in range(multiple_cores):
            process = multiprocessing.Process(
                target=start_preprocess,
                args=(dm.mines_of(all_docs, multiple_cores, i), increment_ner)
            )
            process.start()
    elif split_in:
        try:
            split_in = int(split_in)
            # Parts are given 1-based on the command line; the modulo
            # filter in mines_of needs a 0-based remainder.
            run_part = int(run_part) - 1
        except (TypeError, ValueError):
            # TypeError covers a missing --run-part (int(None)).
            logger.error("Invalid split")
            exit(1)

        # Valid remainders are 0 .. split_in - 1. A remainder equal to
        # split_in can never match "id % split_in", so reject it as well.
        if run_part < 0 or run_part >= split_in:
            logger.error("Parts must be between 1 and {}".format(split_in))
            exit(1)

        docs = dm.mines_of(all_docs, split_in, run_part)
        start_preprocess(docs, increment_ner)
    else:
        start_preprocess(all_docs, increment_ner)
def get_rules(rule_name):
    """Return the loaded rules, optionally narrowed to a single rule name.

    With a falsy ``rule_name`` every rule returned by ``load_rules`` is
    returned. Otherwise only the rules whose ``__name__`` equals
    ``rule_name`` are kept; if none match, an error is logged and the
    process exits with status 1.
    """
    available = load_rules()
    if not rule_name:
        return available

    selected = [rule for rule in available if rule.__name__ == rule_name]
    if not selected:
        logging.error("rule '{}' does not exists".format(rule_name))
        sys.exit(1)
    return selected
+# http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/process/PTBTokenizer.html +# You can use as key any of the "known options" listed on that page, and as value, +# use True or False (python names) for booleans, or strings when option requires a text +# CORENLP_TKN_OPTS = { +# 'latexQuotes': False +# }