From df3d518fc63fed621292712aedbfe6aa93397b12 Mon Sep 17 00:00:00 2001 From: NikolaLazarovOntotext <53425547+NikolaLazarovOntotext@users.noreply.github.com> Date: Mon, 29 Jul 2019 11:14:42 +0300 Subject: [PATCH 1/4] Add files via upload Solving device change problem (from GPU to CPU) --- runner.py | 414 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 runner.py diff --git a/runner.py b/runner.py new file mode 100644 index 0000000..402ca46 --- /dev/null +++ b/runner.py @@ -0,0 +1,414 @@ +import copy +import logging +import sys +import time +import warnings +from collections import OrderedDict + +import pandas as pd + +import pyprind +import torch +from tqdm import tqdm + +from .data import MatchingIterator +from .optim import Optimizer, SoftNLLLoss +from .utils import tally_parameters + +try: + get_ipython + from tqdm import tqdm_notebook as tqdm +except NameError: + from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +class Statistics(object): + """Accumulator for loss statistics, inspired by ONMT. + + Keeps track of the following metrics: + * F1 + * Precision + * Recall + * Accuracy + """ + + def __init__(self): + self.loss_sum = 0 + self.examples = 0 + self.tps = 0 + self.tns = 0 + self.fps = 0 + self.fns = 0 + self.start_time = time.time() + + def update(self, loss=0, tps=0, tns=0, fps=0, fns=0): + examples = tps + tns + fps + fns + self.loss_sum += loss * examples + self.tps += tps + self.tns += tns + self.fps += fps + self.fns += fns + self.examples += examples + + def loss(self): + return self.loss_sum / self.examples + + def f1(self): + prec = self.precision() + recall = self.recall() + return 2 * prec * recall / max(prec + recall, 1) + + def precision(self): + return 100 * self.tps / max(self.tps + self.fps, 1) + + def recall(self): + return 100 * self.tps / max(self.tps + self.fns, 1) + + def accuracy(self): + return 100 * (self.tps + self.tns) / self.examples + + def examples_per_sec(self): + return self.examples / (time.time() - self.start_time) + + +class Runner(object): + """Experiment runner. + + This class implements routines to train, evaluate and make predictions from models. + """ + + @staticmethod + def _print_stats(name, epoch, batch, n_batches, stats, cum_stats): + """Write out batch statistics to stdout. + """ + print((' | {name} | [{epoch}][{batch:4d}/{n_batches}] || Loss: {loss:7.4f} |' + ' F1: {f1:7.2f} | Prec: {prec:7.2f} | Rec: {rec:7.2f} ||' + ' Cum. F1: {cf1:7.2f} | Cum. Prec: {cprec:7.2f} | Cum. Rec: {crec:7.2f} ||' + ' Ex/s: {eps:6.1f}').format( + name=name, + epoch=epoch, + batch=batch, + n_batches=n_batches, + loss=stats.loss(), + f1=stats.f1(), + prec=stats.precision(), + rec=stats.recall(), + cf1=cum_stats.f1(), + cprec=cum_stats.precision(), + crec=cum_stats.recall(), + eps=cum_stats.examples_per_sec())) + + @staticmethod + def _print_final_stats(epoch, runtime, datatime, stats): + """Write out epoch statistics to stdout. + """ + print(('Finished Epoch {epoch} || Run Time: {runtime:6.1f} | ' + 'Load Time: {datatime:6.1f} || F1: {f1:6.2f} | Prec: {prec:6.2f} | ' + 'Rec: {rec:6.2f} || Ex/s: {eps:6.2f}\n').format( + epoch=epoch, + runtime=runtime, + datatime=datatime, + f1=stats.f1(), + prec=stats.precision(), + rec=stats.recall(), + eps=stats.examples_per_sec())) + + @staticmethod + def _set_pbar_status(pbar, stats, cum_stats): + postfix_dict = OrderedDict([ + ('Loss', '{0:7.4f}'.format(stats.loss())), + ('F1', '{0:7.2f}'.format(stats.f1())), + ('Cum. F1', '{0:7.2f}'.format(cum_stats.f1())), + ('Ex/s', '{0:6.1f}'.format(cum_stats.examples_per_sec())), + ]) + pbar.set_postfix(ordered_dict=postfix_dict) + + @staticmethod + def _compute_scores(output, target): + predictions = output.max(1)[1].data + correct = (predictions == target.data).float() + incorrect = (1 - correct).float() + positives = (target.data == 1).float() + negatives = (target.data == 0).float() + + tp = torch.dot(correct, positives) + tn = torch.dot(correct, negatives) + fp = torch.dot(incorrect, negatives) + fn = torch.dot(incorrect, positives) + + return tp, tn, fp, fn + + @staticmethod + def _run(run_type, + model, + dataset, + criterion=None, + optimizer=None, + train=False, + device=None, + batch_size=32, + batch_callback=None, + epoch_callback=None, + progress_style='bar', + log_freq=5, + sort_in_buckets=None, + return_predictions=False, + **kwargs): + + sort_in_buckets = train + run_iter = MatchingIterator( + dataset, + model.meta, + train, + batch_size=batch_size, + device=device, + sort_in_buckets=sort_in_buckets) + + if device == -1: + model = model.cpu() + if criterion: + criterion = criterion.cpu() + elif torch.cuda.is_available(): + model = model.cuda() + if criterion: + criterion = criterion.cuda() + else: + raise ValueError('No GPU available.') + + if train: + model.train() + else: + model.eval() + + epoch = model.epoch + datatime = 0 + runtime = 0 + cum_stats = Statistics() + stats = Statistics() + predictions = [] + id_attr = model.meta.id_field + label_attr = model.meta.label_field + + if train and epoch == 0: + print('* Number of trainable parameters:', tally_parameters(model)) + + epoch_str = 'Epoch {0:d}'.format(epoch + 1) + print('===> ', run_type, epoch_str) + batch_end = time.time() + + # The tqdm-bar for Jupyter notebook is under development. + if progress_style == 'tqdm-bar': + pbar = tqdm( + total=len(run_iter) // log_freq, + bar_format='{l_bar}{bar}{postfix}', + file=sys.stdout) + + # Use the pyprind bar as the default progress bar. + if progress_style == 'bar': + pbar = pyprind.ProgBar(len(run_iter) // log_freq, bar_char='█', width=30) + + for batch_idx, batch in enumerate(run_iter): + batch_start = time.time() + datatime += batch_start - batch_end + + output = model(batch) + + # from torchviz import make_dot, make_dot_from_trace + # dot = make_dot(output.mean(), params=dict(model.named_parameters())) + # pdb.set_trace() + + loss = float('NaN') + if criterion: + loss = criterion(output, getattr(batch, label_attr)) + + if hasattr(batch, label_attr): + scores = Runner._compute_scores(output, getattr(batch, label_attr)) + else: + scores = [0] * 4 + + cum_stats.update(float(loss), *scores) + stats.update(float(loss), *scores) + + if return_predictions: + for idx, id in enumerate(getattr(batch, id_attr)): + predictions.append((id, float(output[idx, 1].exp()))) + + if (batch_idx + 1) % log_freq == 0: + if progress_style == 'log': + Runner._print_stats(run_type, epoch + 1, batch_idx + 1, len(run_iter), + stats, cum_stats) + elif progress_style == 'tqdm-bar': + pbar.update() + Runner._set_pbar_status(pbar, stats, cum_stats) + elif progress_style == 'bar': + pbar.update() + stats = Statistics() + + if train: + model.zero_grad() + loss.backward() + + if not optimizer.params: + optimizer.set_parameters(model.named_parameters()) + optimizer.step() + + batch_end = time.time() + runtime += batch_end - batch_start + + if progress_style == 'tqdm-bar': + pbar.close() + elif progress_style == 'bar': + sys.stderr.flush() + + Runner._print_final_stats(epoch + 1, runtime, datatime, cum_stats) + + if return_predictions: + return predictions + else: + return cum_stats.f1() + + @staticmethod + def train(model, + train_dataset, + validation_dataset, + best_save_path, + epochs=30, + criterion=None, + optimizer=None, + pos_neg_ratio=None, + pos_weight=None, + label_smoothing=0.05, + save_every_prefix=None, + save_every_freq=1, + **kwargs): + """run_train(model, train_dataset, validation_dataset, best_save_path,epochs=30, \ + criterion=None, optimizer=None, pos_neg_ratio=None, pos_weight=None, \ + label_smoothing=0.05, save_every_prefix=None, save_every_freq=None, \ + batch_size=32, device=None, progress_style='bar', log_freq=5, \ + sort_in_buckets=None) + + Train a :class:`deepmatcher.MatchingModel` using the specified training set. + Refer to :meth:`deepmatcher.MatchingModel.run_train` for details on + parameters. + + Returns: + float: The best F1 score obtained by the model on the validation dataset. + """ + + model.initialize(train_dataset) + + model._register_train_buffer('optimizer_state', None) + model._register_train_buffer('best_score', None) + model._register_train_buffer('epoch', None) + + if criterion is None: + if pos_weight is not None: + assert pos_weight < 2 + warnings.warn('"pos_weight" parameter is deprecated and will be removed ' + 'in a later release, please use "pos_neg_ratio" instead', + DeprecationWarning) + assert pos_neg_ratio is None + else: + if pos_neg_ratio is None: + pos_neg_ratio = 1 + else: + assert pos_neg_ratio > 0 + pos_weight = 2 * pos_neg_ratio / (1 + pos_neg_ratio) + + neg_weight = 2 - pos_weight + + criterion = SoftNLLLoss(label_smoothing, + torch.Tensor([neg_weight, pos_weight])) + + optimizer = optimizer or Optimizer() + if model.optimizer_state is not None: + model.optimizer.base_optimizer.load_state_dict(model.optimizer_state) + + if model.epoch is None: + epochs_range = range(epochs) + else: + epochs_range = range(model.epoch + 1, epochs) + + if model.best_score is None: + model.best_score = -1 + optimizer.last_acc = model.best_score + + for epoch in epochs_range: + model.epoch = epoch + Runner._run( + 'TRAIN', model, train_dataset, criterion, optimizer, train=True, **kwargs) + + score = Runner._run('EVAL', model, validation_dataset, train=False, **kwargs) + + optimizer.update_learning_rate(score, epoch + 1) + model.optimizer_state = optimizer.base_optimizer.state_dict() + + new_best_found = False + if score > model.best_score: + print('* Best F1:', score) + model.best_score = score + new_best_found = True + + if best_save_path and new_best_found: + print('Saving best model...') + model.save_state(best_save_path) + print('Done.') + + if save_every_prefix is not None and (epoch + 1) % save_every_freq == 0: + print('Saving epoch model...') + save_path = '{prefix}_ep{epoch}.pth'.format( + prefix=save_every_prefix, epoch=epoch + 1) + model.save_state(save_path) + print('Done.') + print('---------------------\n') + + print('Loading best model...') + model.load_state(best_save_path) + print('Training done.') + + return model.best_score + + def eval(model, dataset, **kwargs): + """eval(model, dataset, device=None, batch_size=32, progress_style='bar', log_freq=5, + sort_in_buckets=None) + + Evaluate a :class:`deepmatcher.MatchingModel` on the specified dataset. + Refer to :meth:`deepmatcher.MatchingModel.run_eval` for details on + parameters. + + Returns: + float: The F1 score obtained by the model on the dataset. + """ + return Runner._run('EVAL', model, dataset, **kwargs) + + def predict(model, dataset, output_attributes=False, **kwargs): + """predict(model, dataset, output_attributes=False, device=None, batch_size=32, \ + progress_style='bar', log_freq=5, sort_in_buckets=None) + + Use a :class:`deepmatcher.MatchingModel` to obtain predictions, i.e., match scores + on the specified dataset. + + Returns: + pandas.DataFrame: A pandas DataFrame containing tuple pair IDs (in the "id" + column) and the corresponding match score predictions (in the + "match_score" column). Will also include all attributes in the original + CSV file of the dataset if `output_attributes` is True. + """ + # Create a shallow copy of the model and reset embeddings to use vocab and + # embeddings from new dataset. + model = copy.deepcopy(model) + model._reset_embeddings(dataset.vocabs) + + predictions = Runner._run( + 'PREDICT', model, dataset, return_predictions=True, **kwargs) + pred_table = pd.DataFrame(predictions, columns=(dataset.id_field, 'match_score')) + pred_table = pred_table.set_index(dataset.id_field) + + if output_attributes: + raw_table = pd.read_csv(dataset.path).set_index(dataset.id_field) + raw_table.index = raw_table.index.astype('str') + pred_table = pred_table.join(raw_table) + + return pred_table From 527065bf5189b800f1a3a33ed92d0db487819efd Mon Sep 17 00:00:00 2001 From: NikolaLazarovOntotext <53425547+NikolaLazarovOntotext@users.noreply.github.com> Date: Mon, 29 Jul 2019 11:16:11 +0300 Subject: [PATCH 2/4] Delete runner.py --- runner.py | 414 ------------------------------------------------------ 1 file changed, 414 deletions(-) delete mode 100644 runner.py diff --git a/runner.py b/runner.py deleted file mode 100644 index 402ca46..0000000 --- a/runner.py +++ /dev/null @@ -1,414 +0,0 @@ -import copy -import logging -import sys -import time -import warnings -from collections import OrderedDict - -import pandas as pd - -import pyprind -import torch -from tqdm import tqdm - -from .data import MatchingIterator -from .optim import Optimizer, SoftNLLLoss -from .utils import tally_parameters - -try: - get_ipython - from tqdm import tqdm_notebook as tqdm -except NameError: - from tqdm import tqdm - -logger = logging.getLogger(__name__) - - -class Statistics(object): - """Accumulator for loss statistics, inspired by ONMT. - - Keeps track of the following metrics: - * F1 - * Precision - * Recall - * Accuracy - """ - - def __init__(self): - self.loss_sum = 0 - self.examples = 0 - self.tps = 0 - self.tns = 0 - self.fps = 0 - self.fns = 0 - self.start_time = time.time() - - def update(self, loss=0, tps=0, tns=0, fps=0, fns=0): - examples = tps + tns + fps + fns - self.loss_sum += loss * examples - self.tps += tps - self.tns += tns - self.fps += fps - self.fns += fns - self.examples += examples - - def loss(self): - return self.loss_sum / self.examples - - def f1(self): - prec = self.precision() - recall = self.recall() - return 2 * prec * recall / max(prec + recall, 1) - - def precision(self): - return 100 * self.tps / max(self.tps + self.fps, 1) - - def recall(self): - return 100 * self.tps / max(self.tps + self.fns, 1) - - def accuracy(self): - return 100 * (self.tps + self.tns) / self.examples - - def examples_per_sec(self): - return self.examples / (time.time() - self.start_time) - - -class Runner(object): - """Experiment runner. - - This class implements routines to train, evaluate and make predictions from models. - """ - - @staticmethod - def _print_stats(name, epoch, batch, n_batches, stats, cum_stats): - """Write out batch statistics to stdout. - """ - print((' | {name} | [{epoch}][{batch:4d}/{n_batches}] || Loss: {loss:7.4f} |' - ' F1: {f1:7.2f} | Prec: {prec:7.2f} | Rec: {rec:7.2f} ||' - ' Cum. F1: {cf1:7.2f} | Cum. Prec: {cprec:7.2f} | Cum. Rec: {crec:7.2f} ||' - ' Ex/s: {eps:6.1f}').format( - name=name, - epoch=epoch, - batch=batch, - n_batches=n_batches, - loss=stats.loss(), - f1=stats.f1(), - prec=stats.precision(), - rec=stats.recall(), - cf1=cum_stats.f1(), - cprec=cum_stats.precision(), - crec=cum_stats.recall(), - eps=cum_stats.examples_per_sec())) - - @staticmethod - def _print_final_stats(epoch, runtime, datatime, stats): - """Write out epoch statistics to stdout. - """ - print(('Finished Epoch {epoch} || Run Time: {runtime:6.1f} | ' - 'Load Time: {datatime:6.1f} || F1: {f1:6.2f} | Prec: {prec:6.2f} | ' - 'Rec: {rec:6.2f} || Ex/s: {eps:6.2f}\n').format( - epoch=epoch, - runtime=runtime, - datatime=datatime, - f1=stats.f1(), - prec=stats.precision(), - rec=stats.recall(), - eps=stats.examples_per_sec())) - - @staticmethod - def _set_pbar_status(pbar, stats, cum_stats): - postfix_dict = OrderedDict([ - ('Loss', '{0:7.4f}'.format(stats.loss())), - ('F1', '{0:7.2f}'.format(stats.f1())), - ('Cum. F1', '{0:7.2f}'.format(cum_stats.f1())), - ('Ex/s', '{0:6.1f}'.format(cum_stats.examples_per_sec())), - ]) - pbar.set_postfix(ordered_dict=postfix_dict) - - @staticmethod - def _compute_scores(output, target): - predictions = output.max(1)[1].data - correct = (predictions == target.data).float() - incorrect = (1 - correct).float() - positives = (target.data == 1).float() - negatives = (target.data == 0).float() - - tp = torch.dot(correct, positives) - tn = torch.dot(correct, negatives) - fp = torch.dot(incorrect, negatives) - fn = torch.dot(incorrect, positives) - - return tp, tn, fp, fn - - @staticmethod - def _run(run_type, - model, - dataset, - criterion=None, - optimizer=None, - train=False, - device=None, - batch_size=32, - batch_callback=None, - epoch_callback=None, - progress_style='bar', - log_freq=5, - sort_in_buckets=None, - return_predictions=False, - **kwargs): - - sort_in_buckets = train - run_iter = MatchingIterator( - dataset, - model.meta, - train, - batch_size=batch_size, - device=device, - sort_in_buckets=sort_in_buckets) - - if device == -1: - model = model.cpu() - if criterion: - criterion = criterion.cpu() - elif torch.cuda.is_available(): - model = model.cuda() - if criterion: - criterion = criterion.cuda() - else: - raise ValueError('No GPU available.') - - if train: - model.train() - else: - model.eval() - - epoch = model.epoch - datatime = 0 - runtime = 0 - cum_stats = Statistics() - stats = Statistics() - predictions = [] - id_attr = model.meta.id_field - label_attr = model.meta.label_field - - if train and epoch == 0: - print('* Number of trainable parameters:', tally_parameters(model)) - - epoch_str = 'Epoch {0:d}'.format(epoch + 1) - print('===> ', run_type, epoch_str) - batch_end = time.time() - - # The tqdm-bar for Jupyter notebook is under development. - if progress_style == 'tqdm-bar': - pbar = tqdm( - total=len(run_iter) // log_freq, - bar_format='{l_bar}{bar}{postfix}', - file=sys.stdout) - - # Use the pyprind bar as the default progress bar. - if progress_style == 'bar': - pbar = pyprind.ProgBar(len(run_iter) // log_freq, bar_char='█', width=30) - - for batch_idx, batch in enumerate(run_iter): - batch_start = time.time() - datatime += batch_start - batch_end - - output = model(batch) - - # from torchviz import make_dot, make_dot_from_trace - # dot = make_dot(output.mean(), params=dict(model.named_parameters())) - # pdb.set_trace() - - loss = float('NaN') - if criterion: - loss = criterion(output, getattr(batch, label_attr)) - - if hasattr(batch, label_attr): - scores = Runner._compute_scores(output, getattr(batch, label_attr)) - else: - scores = [0] * 4 - - cum_stats.update(float(loss), *scores) - stats.update(float(loss), *scores) - - if return_predictions: - for idx, id in enumerate(getattr(batch, id_attr)): - predictions.append((id, float(output[idx, 1].exp()))) - - if (batch_idx + 1) % log_freq == 0: - if progress_style == 'log': - Runner._print_stats(run_type, epoch + 1, batch_idx + 1, len(run_iter), - stats, cum_stats) - elif progress_style == 'tqdm-bar': - pbar.update() - Runner._set_pbar_status(pbar, stats, cum_stats) - elif progress_style == 'bar': - pbar.update() - stats = Statistics() - - if train: - model.zero_grad() - loss.backward() - - if not optimizer.params: - optimizer.set_parameters(model.named_parameters()) - optimizer.step() - - batch_end = time.time() - runtime += batch_end - batch_start - - if progress_style == 'tqdm-bar': - pbar.close() - elif progress_style == 'bar': - sys.stderr.flush() - - Runner._print_final_stats(epoch + 1, runtime, datatime, cum_stats) - - if return_predictions: - return predictions - else: - return cum_stats.f1() - - @staticmethod - def train(model, - train_dataset, - validation_dataset, - best_save_path, - epochs=30, - criterion=None, - optimizer=None, - pos_neg_ratio=None, - pos_weight=None, - label_smoothing=0.05, - save_every_prefix=None, - save_every_freq=1, - **kwargs): - """run_train(model, train_dataset, validation_dataset, best_save_path,epochs=30, \ - criterion=None, optimizer=None, pos_neg_ratio=None, pos_weight=None, \ - label_smoothing=0.05, save_every_prefix=None, save_every_freq=None, \ - batch_size=32, device=None, progress_style='bar', log_freq=5, \ - sort_in_buckets=None) - - Train a :class:`deepmatcher.MatchingModel` using the specified training set. - Refer to :meth:`deepmatcher.MatchingModel.run_train` for details on - parameters. - - Returns: - float: The best F1 score obtained by the model on the validation dataset. - """ - - model.initialize(train_dataset) - - model._register_train_buffer('optimizer_state', None) - model._register_train_buffer('best_score', None) - model._register_train_buffer('epoch', None) - - if criterion is None: - if pos_weight is not None: - assert pos_weight < 2 - warnings.warn('"pos_weight" parameter is deprecated and will be removed ' - 'in a later release, please use "pos_neg_ratio" instead', - DeprecationWarning) - assert pos_neg_ratio is None - else: - if pos_neg_ratio is None: - pos_neg_ratio = 1 - else: - assert pos_neg_ratio > 0 - pos_weight = 2 * pos_neg_ratio / (1 + pos_neg_ratio) - - neg_weight = 2 - pos_weight - - criterion = SoftNLLLoss(label_smoothing, - torch.Tensor([neg_weight, pos_weight])) - - optimizer = optimizer or Optimizer() - if model.optimizer_state is not None: - model.optimizer.base_optimizer.load_state_dict(model.optimizer_state) - - if model.epoch is None: - epochs_range = range(epochs) - else: - epochs_range = range(model.epoch + 1, epochs) - - if model.best_score is None: - model.best_score = -1 - optimizer.last_acc = model.best_score - - for epoch in epochs_range: - model.epoch = epoch - Runner._run( - 'TRAIN', model, train_dataset, criterion, optimizer, train=True, **kwargs) - - score = Runner._run('EVAL', model, validation_dataset, train=False, **kwargs) - - optimizer.update_learning_rate(score, epoch + 1) - model.optimizer_state = optimizer.base_optimizer.state_dict() - - new_best_found = False - if score > model.best_score: - print('* Best F1:', score) - model.best_score = score - new_best_found = True - - if best_save_path and new_best_found: - print('Saving best model...') - model.save_state(best_save_path) - print('Done.') - - if save_every_prefix is not None and (epoch + 1) % save_every_freq == 0: - print('Saving epoch model...') - save_path = '{prefix}_ep{epoch}.pth'.format( - prefix=save_every_prefix, epoch=epoch + 1) - model.save_state(save_path) - print('Done.') - print('---------------------\n') - - print('Loading best model...') - model.load_state(best_save_path) - print('Training done.') - - return model.best_score - - def eval(model, dataset, **kwargs): - """eval(model, dataset, device=None, batch_size=32, progress_style='bar', log_freq=5, - sort_in_buckets=None) - - Evaluate a :class:`deepmatcher.MatchingModel` on the specified dataset. - Refer to :meth:`deepmatcher.MatchingModel.run_eval` for details on - parameters. - - Returns: - float: The F1 score obtained by the model on the dataset. - """ - return Runner._run('EVAL', model, dataset, **kwargs) - - def predict(model, dataset, output_attributes=False, **kwargs): - """predict(model, dataset, output_attributes=False, device=None, batch_size=32, \ - progress_style='bar', log_freq=5, sort_in_buckets=None) - - Use a :class:`deepmatcher.MatchingModel` to obtain predictions, i.e., match scores - on the specified dataset. - - Returns: - pandas.DataFrame: A pandas DataFrame containing tuple pair IDs (in the "id" - column) and the corresponding match score predictions (in the - "match_score" column). Will also include all attributes in the original - CSV file of the dataset if `output_attributes` is True. - """ - # Create a shallow copy of the model and reset embeddings to use vocab and - # embeddings from new dataset. - model = copy.deepcopy(model) - model._reset_embeddings(dataset.vocabs) - - predictions = Runner._run( - 'PREDICT', model, dataset, return_predictions=True, **kwargs) - pred_table = pd.DataFrame(predictions, columns=(dataset.id_field, 'match_score')) - pred_table = pred_table.set_index(dataset.id_field) - - if output_attributes: - raw_table = pd.read_csv(dataset.path).set_index(dataset.id_field) - raw_table.index = raw_table.index.astype('str') - pred_table = pred_table.join(raw_table) - - return pred_table From f08b829532418e05eafc8bf76dcef2965a49d514 Mon Sep 17 00:00:00 2001 From: NikolaLazarovOntotext <53425547+NikolaLazarovOntotext@users.noreply.github.com> Date: Mon, 29 Jul 2019 11:17:11 +0300 Subject: [PATCH 3/4] Delete runner.py --- deepmatcher/runner.py | 414 ------------------------------------------ 1 file changed, 414 deletions(-) delete mode 100644 deepmatcher/runner.py diff --git a/deepmatcher/runner.py b/deepmatcher/runner.py deleted file mode 100644 index e92e6ab..0000000 --- a/deepmatcher/runner.py +++ /dev/null @@ -1,414 +0,0 @@ -import copy -import logging -import sys -import time -import warnings -from collections import OrderedDict - -import pandas as pd - -import pyprind -import torch -from tqdm import tqdm - -from .data import MatchingIterator -from .optim import Optimizer, SoftNLLLoss -from .utils import tally_parameters - -try: - get_ipython - from tqdm import tqdm_notebook as tqdm -except NameError: - from tqdm import tqdm - -logger = logging.getLogger(__name__) - - -class Statistics(object): - """Accumulator for loss statistics, inspired by ONMT. - - Keeps track of the following metrics: - * F1 - * Precision - * Recall - * Accuracy - """ - - def __init__(self): - self.loss_sum = 0 - self.examples = 0 - self.tps = 0 - self.tns = 0 - self.fps = 0 - self.fns = 0 - self.start_time = time.time() - - def update(self, loss=0, tps=0, tns=0, fps=0, fns=0): - examples = tps + tns + fps + fns - self.loss_sum += loss * examples - self.tps += tps - self.tns += tns - self.fps += fps - self.fns += fns - self.examples += examples - - def loss(self): - return self.loss_sum / self.examples - - def f1(self): - prec = self.precision() - recall = self.recall() - return 2 * prec * recall / max(prec + recall, 1) - - def precision(self): - return 100 * self.tps / max(self.tps + self.fps, 1) - - def recall(self): - return 100 * self.tps / max(self.tps + self.fns, 1) - - def accuracy(self): - return 100 * (self.tps + self.tns) / self.examples - - def examples_per_sec(self): - return self.examples / (time.time() - self.start_time) - - -class Runner(object): - """Experiment runner. - - This class implements routines to train, evaluate and make predictions from models. - """ - - @staticmethod - def _print_stats(name, epoch, batch, n_batches, stats, cum_stats): - """Write out batch statistics to stdout. - """ - print((' | {name} | [{epoch}][{batch:4d}/{n_batches}] || Loss: {loss:7.4f} |' - ' F1: {f1:7.2f} | Prec: {prec:7.2f} | Rec: {rec:7.2f} ||' - ' Cum. F1: {cf1:7.2f} | Cum. Prec: {cprec:7.2f} | Cum. Rec: {crec:7.2f} ||' - ' Ex/s: {eps:6.1f}').format( - name=name, - epoch=epoch, - batch=batch, - n_batches=n_batches, - loss=stats.loss(), - f1=stats.f1(), - prec=stats.precision(), - rec=stats.recall(), - cf1=cum_stats.f1(), - cprec=cum_stats.precision(), - crec=cum_stats.recall(), - eps=cum_stats.examples_per_sec())) - - @staticmethod - def _print_final_stats(epoch, runtime, datatime, stats): - """Write out epoch statistics to stdout. - """ - print(('Finished Epoch {epoch} || Run Time: {runtime:6.1f} | ' - 'Load Time: {datatime:6.1f} || F1: {f1:6.2f} | Prec: {prec:6.2f} | ' - 'Rec: {rec:6.2f} || Ex/s: {eps:6.2f}\n').format( - epoch=epoch, - runtime=runtime, - datatime=datatime, - f1=stats.f1(), - prec=stats.precision(), - rec=stats.recall(), - eps=stats.examples_per_sec())) - - @staticmethod - def _set_pbar_status(pbar, stats, cum_stats): - postfix_dict = OrderedDict([ - ('Loss', '{0:7.4f}'.format(stats.loss())), - ('F1', '{0:7.2f}'.format(stats.f1())), - ('Cum. F1', '{0:7.2f}'.format(cum_stats.f1())), - ('Ex/s', '{0:6.1f}'.format(cum_stats.examples_per_sec())), - ]) - pbar.set_postfix(ordered_dict=postfix_dict) - - @staticmethod - def _compute_scores(output, target): - predictions = output.max(1)[1].data - correct = (predictions == target.data).float() - incorrect = (1 - correct).float() - positives = (target.data == 1).float() - negatives = (target.data == 0).float() - - tp = torch.dot(correct, positives) - tn = torch.dot(correct, negatives) - fp = torch.dot(incorrect, negatives) - fn = torch.dot(incorrect, positives) - - return tp, tn, fp, fn - - @staticmethod - def _run(run_type, - model, - dataset, - criterion=None, - optimizer=None, - train=False, - device=None, - batch_size=32, - batch_callback=None, - epoch_callback=None, - progress_style='bar', - log_freq=5, - sort_in_buckets=None, - return_predictions=False, - **kwargs): - - sort_in_buckets = train - run_iter = MatchingIterator( - dataset, - model.meta, - train, - batch_size=batch_size, - device=device, - sort_in_buckets=sort_in_buckets) - - if device == 'cpu': - model = model.cpu() - if criterion: - criterion = criterion.cpu() - elif torch.cuda.is_available(): - model = model.cuda() - if criterion: - criterion = criterion.cuda() - elif device == 'gpu': - raise ValueError('No GPU available.') - - if train: - model.train() - else: - model.eval() - - epoch = model.epoch - datatime = 0 - runtime = 0 - cum_stats = Statistics() - stats = Statistics() - predictions = [] - id_attr = model.meta.id_field - label_attr = model.meta.label_field - - if train and epoch == 0: - print('* Number of trainable parameters:', tally_parameters(model)) - - epoch_str = 'Epoch {0:d}'.format(epoch + 1) - print('===> ', run_type, epoch_str) - batch_end = time.time() - - # The tqdm-bar for Jupyter notebook is under development. - if progress_style == 'tqdm-bar': - pbar = tqdm( - total=len(run_iter) // log_freq, - bar_format='{l_bar}{bar}{postfix}', - file=sys.stdout) - - # Use the pyprind bar as the default progress bar. - if progress_style == 'bar': - pbar = pyprind.ProgBar(len(run_iter) // log_freq, bar_char='█', width=30) - - for batch_idx, batch in enumerate(run_iter): - batch_start = time.time() - datatime += batch_start - batch_end - - output = model(batch) - - # from torchviz import make_dot, make_dot_from_trace - # dot = make_dot(output.mean(), params=dict(model.named_parameters())) - # pdb.set_trace() - - loss = float('NaN') - if criterion: - loss = criterion(output, getattr(batch, label_attr)) - - if hasattr(batch, label_attr): - scores = Runner._compute_scores(output, getattr(batch, label_attr)) - else: - scores = [0] * 4 - - cum_stats.update(float(loss), *scores) - stats.update(float(loss), *scores) - - if return_predictions: - for idx, id in enumerate(getattr(batch, id_attr)): - predictions.append((id, float(output[idx, 1].exp()))) - - if (batch_idx + 1) % log_freq == 0: - if progress_style == 'log': - Runner._print_stats(run_type, epoch + 1, batch_idx + 1, len(run_iter), - stats, cum_stats) - elif progress_style == 'tqdm-bar': - pbar.update() - Runner._set_pbar_status(pbar, stats, cum_stats) - elif progress_style == 'bar': - pbar.update() - stats = Statistics() - - if train: - model.zero_grad() - loss.backward() - - if not optimizer.params: - optimizer.set_parameters(model.named_parameters()) - optimizer.step() - - batch_end = time.time() - runtime += batch_end - batch_start - - if progress_style == 'tqdm-bar': - pbar.close() - elif progress_style == 'bar': - sys.stderr.flush() - - Runner._print_final_stats(epoch + 1, runtime, datatime, cum_stats) - - if return_predictions: - return predictions - else: - return cum_stats.f1() - - @staticmethod - def train(model, - train_dataset, - validation_dataset, - best_save_path, - epochs=30, - criterion=None, - optimizer=None, - pos_neg_ratio=None, - pos_weight=None, - label_smoothing=0.05, - save_every_prefix=None, - save_every_freq=1, - **kwargs): - """run_train(model, train_dataset, validation_dataset, best_save_path,epochs=30, \ - criterion=None, optimizer=None, pos_neg_ratio=None, pos_weight=None, \ - label_smoothing=0.05, save_every_prefix=None, save_every_freq=None, \ - batch_size=32, device=None, progress_style='bar', log_freq=5, \ - sort_in_buckets=None) - - Train a :class:`deepmatcher.MatchingModel` using the specified training set. - Refer to :meth:`deepmatcher.MatchingModel.run_train` for details on - parameters. - - Returns: - float: The best F1 score obtained by the model on the validation dataset. - """ - - model.initialize(train_dataset) - - model._register_train_buffer('optimizer_state', None) - model._register_train_buffer('best_score', None) - model._register_train_buffer('epoch', None) - - if criterion is None: - if pos_weight is not None: - assert pos_weight < 2 - warnings.warn('"pos_weight" parameter is deprecated and will be removed ' - 'in a later release, please use "pos_neg_ratio" instead', - DeprecationWarning) - assert pos_neg_ratio is None - else: - if pos_neg_ratio is None: - pos_neg_ratio = 1 - else: - assert pos_neg_ratio > 0 - pos_weight = 2 * pos_neg_ratio / (1 + pos_neg_ratio) - - neg_weight = 2 - pos_weight - - criterion = SoftNLLLoss(label_smoothing, - torch.Tensor([neg_weight, pos_weight])) - - optimizer = optimizer or Optimizer() - if model.optimizer_state is not None: - model.optimizer.base_optimizer.load_state_dict(model.optimizer_state) - - if model.epoch is None: - epochs_range = range(epochs) - else: - epochs_range = range(model.epoch + 1, epochs) - - if model.best_score is None: - model.best_score = -1 - optimizer.last_acc = model.best_score - - for epoch in epochs_range: - model.epoch = epoch - Runner._run( - 'TRAIN', model, train_dataset, criterion, optimizer, train=True, **kwargs) - - score = Runner._run('EVAL', model, validation_dataset, train=False, **kwargs) - - optimizer.update_learning_rate(score, epoch + 1) - model.optimizer_state = optimizer.base_optimizer.state_dict() - - new_best_found = False - if score > model.best_score: - print('* Best F1:', score) - model.best_score = score - new_best_found = True - - if best_save_path and new_best_found: - print('Saving best model...') - model.save_state(best_save_path) - print('Done.') - - if save_every_prefix is not None and (epoch + 1) % save_every_freq == 0: - print('Saving epoch model...') - save_path = '{prefix}_ep{epoch}.pth'.format( - prefix=save_every_prefix, epoch=epoch + 1) - model.save_state(save_path) - print('Done.') - print('---------------------\n') - - print('Loading best model...') - model.load_state(best_save_path) - print('Training done.') - - return model.best_score - - def eval(model, dataset, **kwargs): - """eval(model, dataset, device=None, batch_size=32, progress_style='bar', log_freq=5, - sort_in_buckets=None) - - Evaluate a :class:`deepmatcher.MatchingModel` on the specified dataset. - Refer to :meth:`deepmatcher.MatchingModel.run_eval` for details on - parameters. - - Returns: - float: The F1 score obtained by the model on the dataset. - """ - return Runner._run('EVAL', model, dataset, **kwargs) - - def predict(model, dataset, output_attributes=False, **kwargs): - """predict(model, dataset, output_attributes=False, device=None, batch_size=32, \ - progress_style='bar', log_freq=5, sort_in_buckets=None) - - Use a :class:`deepmatcher.MatchingModel` to obtain predictions, i.e., match scores - on the specified dataset. - - Returns: - pandas.DataFrame: A pandas DataFrame containing tuple pair IDs (in the "id" - column) and the corresponding match score predictions (in the - "match_score" column). Will also include all attributes in the original - CSV file of the dataset if `output_attributes` is True. - """ - # Create a shallow copy of the model and reset embeddings to use vocab and - # embeddings from new dataset. - model = copy.deepcopy(model) - model._reset_embeddings(dataset.vocabs) - - predictions = Runner._run( - 'PREDICT', model, dataset, return_predictions=True, **kwargs) - pred_table = pd.DataFrame(predictions, columns=(dataset.id_field, 'match_score')) - pred_table = pred_table.set_index(dataset.id_field) - - if output_attributes: - raw_table = pd.read_csv(dataset.path).set_index(dataset.id_field) - raw_table.index = raw_table.index.astype('str') - pred_table = pred_table.join(raw_table) - - return pred_table From 91771ac0497f5bc19124718db748da5638f4272d Mon Sep 17 00:00:00 2001 From: NikolaLazarovOntotext <53425547+NikolaLazarovOntotext@users.noreply.github.com> Date: Mon, 29 Jul 2019 11:18:21 +0300 Subject: [PATCH 4/4] Add files via upload Solving the error that occurs when trying to change the device( from GPU to CPU) --- deepmatcher/runner.py | 414 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 deepmatcher/runner.py diff --git a/deepmatcher/runner.py b/deepmatcher/runner.py new file mode 100644 index 0000000..402ca46 --- /dev/null +++ b/deepmatcher/runner.py @@ -0,0 +1,414 @@ +import copy +import logging +import sys +import time +import warnings +from collections import OrderedDict + +import pandas as pd + +import pyprind +import torch +from tqdm import tqdm + +from .data import MatchingIterator +from .optim import Optimizer, SoftNLLLoss +from .utils import tally_parameters + +try: + get_ipython + from tqdm import tqdm_notebook as tqdm +except NameError: + from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +class Statistics(object): + """Accumulator for loss statistics, inspired by ONMT. + + Keeps track of the following metrics: + * F1 + * Precision + * Recall + * Accuracy + """ + + def __init__(self): + self.loss_sum = 0 + self.examples = 0 + self.tps = 0 + self.tns = 0 + self.fps = 0 + self.fns = 0 + self.start_time = time.time() + + def update(self, loss=0, tps=0, tns=0, fps=0, fns=0): + examples = tps + tns + fps + fns + self.loss_sum += loss * examples + self.tps += tps + self.tns += tns + self.fps += fps + self.fns += fns + self.examples += examples + + def loss(self): + return self.loss_sum / self.examples + + def f1(self): + prec = self.precision() + recall = self.recall() + return 2 * prec * recall / max(prec + recall, 1) + + def precision(self): + return 100 * self.tps / max(self.tps + self.fps, 1) + + def recall(self): + return 100 * self.tps / max(self.tps + self.fns, 1) + + def accuracy(self): + return 100 * (self.tps + self.tns) / self.examples + + def examples_per_sec(self): + return self.examples / (time.time() - self.start_time) + + +class Runner(object): + """Experiment runner. + + This class implements routines to train, evaluate and make predictions from models. + """ + + @staticmethod + def _print_stats(name, epoch, batch, n_batches, stats, cum_stats): + """Write out batch statistics to stdout. + """ + print((' | {name} | [{epoch}][{batch:4d}/{n_batches}] || Loss: {loss:7.4f} |' + ' F1: {f1:7.2f} | Prec: {prec:7.2f} | Rec: {rec:7.2f} ||' + ' Cum. F1: {cf1:7.2f} | Cum. Prec: {cprec:7.2f} | Cum. Rec: {crec:7.2f} ||' + ' Ex/s: {eps:6.1f}').format( + name=name, + epoch=epoch, + batch=batch, + n_batches=n_batches, + loss=stats.loss(), + f1=stats.f1(), + prec=stats.precision(), + rec=stats.recall(), + cf1=cum_stats.f1(), + cprec=cum_stats.precision(), + crec=cum_stats.recall(), + eps=cum_stats.examples_per_sec())) + + @staticmethod + def _print_final_stats(epoch, runtime, datatime, stats): + """Write out epoch statistics to stdout. + """ + print(('Finished Epoch {epoch} || Run Time: {runtime:6.1f} | ' + 'Load Time: {datatime:6.1f} || F1: {f1:6.2f} | Prec: {prec:6.2f} | ' + 'Rec: {rec:6.2f} || Ex/s: {eps:6.2f}\n').format( + epoch=epoch, + runtime=runtime, + datatime=datatime, + f1=stats.f1(), + prec=stats.precision(), + rec=stats.recall(), + eps=stats.examples_per_sec())) + + @staticmethod + def _set_pbar_status(pbar, stats, cum_stats): + postfix_dict = OrderedDict([ + ('Loss', '{0:7.4f}'.format(stats.loss())), + ('F1', '{0:7.2f}'.format(stats.f1())), + ('Cum. F1', '{0:7.2f}'.format(cum_stats.f1())), + ('Ex/s', '{0:6.1f}'.format(cum_stats.examples_per_sec())), + ]) + pbar.set_postfix(ordered_dict=postfix_dict) + + @staticmethod + def _compute_scores(output, target): + predictions = output.max(1)[1].data + correct = (predictions == target.data).float() + incorrect = (1 - correct).float() + positives = (target.data == 1).float() + negatives = (target.data == 0).float() + + tp = torch.dot(correct, positives) + tn = torch.dot(correct, negatives) + fp = torch.dot(incorrect, negatives) + fn = torch.dot(incorrect, positives) + + return tp, tn, fp, fn + + @staticmethod + def _run(run_type, + model, + dataset, + criterion=None, + optimizer=None, + train=False, + device=None, + batch_size=32, + batch_callback=None, + epoch_callback=None, + progress_style='bar', + log_freq=5, + sort_in_buckets=None, + return_predictions=False, + **kwargs): + + sort_in_buckets = train + run_iter = MatchingIterator( + dataset, + model.meta, + train, + batch_size=batch_size, + device=device, + sort_in_buckets=sort_in_buckets) + + if device == -1: + model = model.cpu() + if criterion: + criterion = criterion.cpu() + elif torch.cuda.is_available(): + model = model.cuda() + if criterion: + criterion = criterion.cuda() + else: + raise ValueError('No GPU available.') + + if train: + model.train() + else: + model.eval() + + epoch = model.epoch + datatime = 0 + runtime = 0 + cum_stats = Statistics() + stats = Statistics() + predictions = [] + id_attr = model.meta.id_field + label_attr = model.meta.label_field + + if train and epoch == 0: + print('* Number of trainable parameters:', tally_parameters(model)) + + epoch_str = 'Epoch {0:d}'.format(epoch + 1) + print('===> ', run_type, epoch_str) + batch_end = time.time() + + # The tqdm-bar for Jupyter notebook is under development. + if progress_style == 'tqdm-bar': + pbar = tqdm( + total=len(run_iter) // log_freq, + bar_format='{l_bar}{bar}{postfix}', + file=sys.stdout) + + # Use the pyprind bar as the default progress bar. + if progress_style == 'bar': + pbar = pyprind.ProgBar(len(run_iter) // log_freq, bar_char='█', width=30) + + for batch_idx, batch in enumerate(run_iter): + batch_start = time.time() + datatime += batch_start - batch_end + + output = model(batch) + + # from torchviz import make_dot, make_dot_from_trace + # dot = make_dot(output.mean(), params=dict(model.named_parameters())) + # pdb.set_trace() + + loss = float('NaN') + if criterion: + loss = criterion(output, getattr(batch, label_attr)) + + if hasattr(batch, label_attr): + scores = Runner._compute_scores(output, getattr(batch, label_attr)) + else: + scores = [0] * 4 + + cum_stats.update(float(loss), *scores) + stats.update(float(loss), *scores) + + if return_predictions: + for idx, id in enumerate(getattr(batch, id_attr)): + predictions.append((id, float(output[idx, 1].exp()))) + + if (batch_idx + 1) % log_freq == 0: + if progress_style == 'log': + Runner._print_stats(run_type, epoch + 1, batch_idx + 1, len(run_iter), + stats, cum_stats) + elif progress_style == 'tqdm-bar': + pbar.update() + Runner._set_pbar_status(pbar, stats, cum_stats) + elif progress_style == 'bar': + pbar.update() + stats = Statistics() + + if train: + model.zero_grad() + loss.backward() + + if not optimizer.params: + optimizer.set_parameters(model.named_parameters()) + optimizer.step() + + batch_end = time.time() + runtime += batch_end - batch_start + + if progress_style == 'tqdm-bar': + pbar.close() + elif progress_style == 'bar': + sys.stderr.flush() + + Runner._print_final_stats(epoch + 1, runtime, datatime, cum_stats) + + if return_predictions: + return predictions + else: + return cum_stats.f1() + + @staticmethod + def train(model, + train_dataset, + validation_dataset, + best_save_path, + epochs=30, + criterion=None, + optimizer=None, + pos_neg_ratio=None, + pos_weight=None, + label_smoothing=0.05, + save_every_prefix=None, + save_every_freq=1, + **kwargs): + """run_train(model, train_dataset, validation_dataset, best_save_path,epochs=30, \ + criterion=None, optimizer=None, pos_neg_ratio=None, pos_weight=None, \ + label_smoothing=0.05, save_every_prefix=None, save_every_freq=None, \ + batch_size=32, device=None, progress_style='bar', log_freq=5, \ + sort_in_buckets=None) + + Train a :class:`deepmatcher.MatchingModel` using the specified training set. + Refer to :meth:`deepmatcher.MatchingModel.run_train` for details on + parameters. + + Returns: + float: The best F1 score obtained by the model on the validation dataset. + """ + + model.initialize(train_dataset) + + model._register_train_buffer('optimizer_state', None) + model._register_train_buffer('best_score', None) + model._register_train_buffer('epoch', None) + + if criterion is None: + if pos_weight is not None: + assert pos_weight < 2 + warnings.warn('"pos_weight" parameter is deprecated and will be removed ' + 'in a later release, please use "pos_neg_ratio" instead', + DeprecationWarning) + assert pos_neg_ratio is None + else: + if pos_neg_ratio is None: + pos_neg_ratio = 1 + else: + assert pos_neg_ratio > 0 + pos_weight = 2 * pos_neg_ratio / (1 + pos_neg_ratio) + + neg_weight = 2 - pos_weight + + criterion = SoftNLLLoss(label_smoothing, + torch.Tensor([neg_weight, pos_weight])) + + optimizer = optimizer or Optimizer() + if model.optimizer_state is not None: + model.optimizer.base_optimizer.load_state_dict(model.optimizer_state) + + if model.epoch is None: + epochs_range = range(epochs) + else: + epochs_range = range(model.epoch + 1, epochs) + + if model.best_score is None: + model.best_score = -1 + optimizer.last_acc = model.best_score + + for epoch in epochs_range: + model.epoch = epoch + Runner._run( + 'TRAIN', model, train_dataset, criterion, optimizer, train=True, **kwargs) + + score = Runner._run('EVAL', model, validation_dataset, train=False, **kwargs) + + optimizer.update_learning_rate(score, epoch + 1) + model.optimizer_state = optimizer.base_optimizer.state_dict() + + new_best_found = False + if score > model.best_score: + print('* Best F1:', score) + model.best_score = score + new_best_found = True + + if best_save_path and new_best_found: + print('Saving best model...') + model.save_state(best_save_path) + print('Done.') + + if save_every_prefix is not None and (epoch + 1) % save_every_freq == 0: + print('Saving epoch model...') + save_path = '{prefix}_ep{epoch}.pth'.format( + prefix=save_every_prefix, epoch=epoch + 1) + model.save_state(save_path) + print('Done.') + print('---------------------\n') + + print('Loading best model...') + model.load_state(best_save_path) + print('Training done.') + + return model.best_score + + def eval(model, dataset, **kwargs): + """eval(model, dataset, device=None, batch_size=32, progress_style='bar', log_freq=5, + sort_in_buckets=None) + + Evaluate a :class:`deepmatcher.MatchingModel` on the specified dataset. + Refer to :meth:`deepmatcher.MatchingModel.run_eval` for details on + parameters. + + Returns: + float: The F1 score obtained by the model on the dataset. + """ + return Runner._run('EVAL', model, dataset, **kwargs) + + def predict(model, dataset, output_attributes=False, **kwargs): + """predict(model, dataset, output_attributes=False, device=None, batch_size=32, \ + progress_style='bar', log_freq=5, sort_in_buckets=None) + + Use a :class:`deepmatcher.MatchingModel` to obtain predictions, i.e., match scores + on the specified dataset. + + Returns: + pandas.DataFrame: A pandas DataFrame containing tuple pair IDs (in the "id" + column) and the corresponding match score predictions (in the + "match_score" column). Will also include all attributes in the original + CSV file of the dataset if `output_attributes` is True. + """ + # Create a shallow copy of the model and reset embeddings to use vocab and + # embeddings from new dataset. + model = copy.deepcopy(model) + model._reset_embeddings(dataset.vocabs) + + predictions = Runner._run( + 'PREDICT', model, dataset, return_predictions=True, **kwargs) + pred_table = pd.DataFrame(predictions, columns=(dataset.id_field, 'match_score')) + pred_table = pred_table.set_index(dataset.id_field) + + if output_attributes: + raw_table = pd.read_csv(dataset.path).set_index(dataset.id_field) + raw_table.index = raw_table.index.astype('str') + pred_table = pred_table.join(raw_table) + + return pred_table