diff --git a/sourced/ml/core/algorithms/__init__.py b/sourced/ml/core/algorithms/__init__.py new file mode 100644 index 0000000..f457888 --- /dev/null +++ b/sourced/ml/core/algorithms/__init__.py @@ -0,0 +1,9 @@ +# flake8: noqa +from sourced.ml.core.algorithms.tf_idf import log_tf_log_idf +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag, uast2sequence +from sourced.ml.core.algorithms.uast_struct_to_bag import UastRandomWalk2Bag, UastSeq2Bag +from sourced.ml.core.algorithms.uast_inttypes_to_nodes import Uast2QuantizedChildren +from sourced.ml.core.algorithms.uast_inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.algorithms.uast_to_role_id_pairs import Uast2RoleIdPairs +from sourced.ml.core.algorithms.uast_id_distance import Uast2IdLineDistance, Uast2IdTreeDistance +from sourced.ml.core.algorithms.uast_to_id_sequence import Uast2IdSequence diff --git a/sourced/ml/core/algorithms/id_embedding.py b/sourced/ml/core/algorithms/id_embedding.py new file mode 100644 index 0000000..2a8ac40 --- /dev/null +++ b/sourced/ml/core/algorithms/id_embedding.py @@ -0,0 +1,42 @@ +import numpy + + +def extract_coocc_matrix(global_shape, word_indices, model): + # Stage 1 - extract the tokens, map them to the global vocabulary + indices = [] + mapped_indices = [] + for i, w in enumerate(model.tokens): + gi = word_indices.get(w) + if gi is not None: + indices.append(i) + mapped_indices.append(gi) + indices = numpy.array(indices) + mapped_indices = numpy.array(mapped_indices) + # Stage 2 - sort the matched tokens by the index in the vocabulary + order = numpy.argsort(mapped_indices) + indices = indices[order] + mapped_indices = mapped_indices[order] + # Stage 3 - produce the csr_matrix with the matched tokens **only** + matrix = model.matrix.tocsr()[indices][:, indices] + # Stage 4 - convert this matrix to the global (ccmatrix) coordinates + csr_indices = matrix.indices + for i, v in enumerate(csr_indices): + # Here we use the fact that indices and mapped_indices are in the same order + csr_indices[i] = mapped_indices[v] + csr_indptr = matrix.indptr + new_indptr = [0] + for i, v in enumerate(mapped_indices): + prev_ptr = csr_indptr[i] + ptr = csr_indptr[i + 1] + + # Handle missing rows + prev = (mapped_indices[i - 1] + 1) if i > 0 else 0 + for _ in range(prev, v): + new_indptr.append(prev_ptr) + + new_indptr.append(ptr) + for _ in range(mapped_indices[-1] + 1, global_shape[0]): + new_indptr.append(csr_indptr[-1]) + matrix.indptr = numpy.array(new_indptr) + matrix._shape = global_shape + return matrix diff --git a/sourced/ml/core/algorithms/id_splitter/README.md b/sourced/ml/core/algorithms/id_splitter/README.md new file mode 100644 index 0000000..695225b --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/README.md @@ -0,0 +1,128 @@ +# Neural Identifier Splitter +Article [Splitting source code identifiers using Bidirectional LSTM Recurrent Neural Network](https://arxiv.org/abs/1805.11651). + +### Agenda +* Data +* Training pipeline +* How to launch + +### Data +You can download the dataset [here](https://drive.google.com/open?id=1wZR5zF1GL1fVcA1gZuAN_9rSLd5ssqKV). More information about the dataset is available [here](https://github.com/src-d/datasets/tree/master/Identifiers). +#### Data format +* format of file: `.csv.gz`. 
+* the `csv` structure: + +|num_files|num_occ|num_repos|token|token_split| +|:--|:--|:--|:--|:--| +|1|2|1|quesesSet|queses set| +|...|...|...|...|...| + +#### Data stats +* 49 millions of identifiers +* 1 GB + +### Training pipeline +Training pipeline consists of several steps +* [prepare features](https://github.com/src-d/ml/blob/master/sourced/ml/algorithms/id_splitter/features.py#L44-#L118) - read data, extract features, train/test split +* [prepare generators for keras](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L34-#L48) +* [prepare model - RNN or CNN](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L53-#L76) +* [training](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L78-#L89) +* [quality report and save the model](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L91-#L96) + +### How to launch +First of all you need to download data using link above. + +Usage: +```console +usage: srcml train-id-split [-h] -i INPUT [-e EPOCHS] [-b BATCH_SIZE] + [-l LENGTH] -o OUTPUT [-t TEST_RATIO] + [-p {pre,post}] [--optimizer {RMSprop,Adam}] + [--lr LR] [--final-lr FINAL_LR] + [--samples-before-report SAMPLES_BEFORE_REPORT] + [--val-batch-size VAL_BATCH_SIZE] [--seed SEED] + [--devices DEVICES] + [--csv-identifier CSV_IDENTIFIER] + [--csv-identifier-split CSV_IDENTIFIER_SPLIT] + [--include-csv-header] --model {RNN,CNN} + [-s STACK] + [--type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU}] + [-n NEURONS] [-f FILTERS] [-k KERNEL_SIZES] + [--dim-reduction DIM_REDUCTION] + +optional arguments: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Path to the input data in CSV + format:num_files,num_occ,num_repos,token,token_split + -e EPOCHS, --epochs EPOCHS + Number of training epochs. The more the betterbut the + training time is proportional. (default: 10) + -b BATCH_SIZE, --batch-size BATCH_SIZE + Batch size. Higher values better utilize GPUsbut may + harm the convergence. (default: 500) + -l LENGTH, --length LENGTH + RNN sequence length. (default: 40) + -o OUTPUT, --output OUTPUT + Path to store the trained model. + -t TEST_RATIO, --test-ratio TEST_RATIO + Fraction of the dataset to use for evaluation. + (default: 0.2) + -p {pre,post}, --padding {pre,post} + Whether to pad before or after each sequence. + (default: post) + --optimizer {RMSprop,Adam} + Algorithm to use as an optimizer for the neural net. + (default: Adam) + --lr LR Initial learning rate. (default: 0.001) + --final-lr FINAL_LR Final learning rate. The decrease from the initial + learning rate is done linearly. (default: 1e-05) + --samples-before-report SAMPLES_BEFORE_REPORT + Number of samples between each validation reportand + training updates. (default: 5000000) + --val-batch-size VAL_BATCH_SIZE + Batch size for validation.It can be increased to speed + up the pipeline butit proportionally increases the + memory consumption. (default: 2000) + --seed SEED Random seed. (default: 1989) + --devices DEVICES Device(s) to use. '-1' means CPU. (default: 0) + --csv-identifier CSV_IDENTIFIER + Column name in the CSV file for the raw identifier. + (default: 3) + --csv-identifier-split CSV_IDENTIFIER_SPLIT + Column name in the CSV file for the splitidentifier. + (default: 4) + --include-csv-header Treat the first line of the input CSV as a + regularline. (default: False) + --model {RNN,CNN} Neural Network model to use to learn the + identifiersplitting task. + -s STACK, --stack STACK + Number of layers stacked on each other. 
(default: 2) + --type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU} + Recurrent layer type to use. (default: LSTM) + -n NEURONS, --neurons NEURONS + Number of neurons on each layer. (default: 256) + -f FILTERS, --filters FILTERS + Number of filters for each kernel size. (default: + 64,32,16,8) + -k KERNEL_SIZES, --kernel-sizes KERNEL_SIZES + Sizes for sliding windows. (default: 2,4,8,16) + --dim-reduction DIM_REDUCTION + Number of 1-d kernels to reduce dimensionalityafter + each layer. (default: 32) +``` + + +Examples of commands: +1) Train RNN with LSTM cells +```console +srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output +``` +2) Train RNN with CuDNNLSTM cells +```console +srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output \ +--type-cell CuDNNLSTM +``` +3) Train CNN +```console +srcml train-id-split --model CNN --input /path/to/input.csv.gz --output /path/to/output +``` diff --git a/sourced/ml/core/algorithms/id_splitter/__init__.py b/sourced/ml/core/algorithms/id_splitter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/algorithms/id_splitter/features.py b/sourced/ml/core/algorithms/id_splitter/features.py new file mode 100644 index 0000000..0333dd8 --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/features.py @@ -0,0 +1,118 @@ +import logging +import string +import tarfile +from typing import List, Tuple + +from modelforge.progress_bar import progress_bar +import numpy + + +def read_identifiers(csv_path: str, use_header: bool, max_identifier_len: int, identifier_col: int, + split_identifier_col: int, shuffle: bool = True) -> List[str]: + """ + Reads and filters too long identifiers in the CSV file. + + :param csv_path: path to the CSV file. + :param use_header: uses header as normal line (True) or treat as header line with column names. + :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer. + :param identifier_col: column name in the CSV file for the raw identifier. + :param split_identifier_col: column name in the CSV file for the split identifier lowercase. + :param shuffle: indicates whether to reorder the list of identifiers + at random after reading it. + :return: list of split identifiers. + """ + log = logging.getLogger("read_identifiers") + log.info("Reading data from the CSV file %s", csv_path) + identifiers = [] + # TODO: Update dataset loading as soon as https://github.com/src-d/backlog/issues/1212 done + # Think about dataset download step + with tarfile.open(csv_path, encoding="utf-8") as f: + assert len(f.members) == 1, "One archived file is expected, got: %s" % len(f.members) + content = f.extractfile(f.members[0]) + if not use_header: + content.readline() + for line in progress_bar(content.readlines(), log): + row = line.decode("utf-8").strip().split(",") + if len(row[identifier_col]) <= max_identifier_len: + identifiers.append(row[split_identifier_col]) + if shuffle: + numpy.random.shuffle(identifiers) + log.info("Number of identifiers after filtering: %s." % len(identifiers)) + return identifiers + + +def prepare_features(csv_path: str, use_header: bool, max_identifier_len: int, + identifier_col: int, split_identifier_col: int, test_ratio: float, + padding: str, shuffle: bool = True) -> Tuple[numpy.array]: + """ + Prepare the features to train the identifier splitting task. + + :param csv_path: path to the CSV file. + :param use_header: uses header as normal line (True) or treat as header line with column names. 
+ :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer. + :param identifier_col: column in the CSV file for the raw identifier. + :param split_identifier_col: column in the CSV file for the split identifier. + :param shuffle: indicates whether to reorder the list of identifiers + at random after reading it. + :param test_ratio: Proportion of test samples used for evaluation. + :param padding: position where to add padding values: + after the intput sequence if "post", before if "pre". + :return: training and testing features to train the neural net for the splitting task. + """ + from keras.preprocessing.sequence import pad_sequences + log = logging.getLogger("prepare_features") + + # read data from the input file + identifiers = read_identifiers(csv_path=csv_path, use_header=use_header, + max_identifier_len=max_identifier_len, + identifier_col=identifier_col, + split_identifier_col=split_identifier_col, shuffle=shuffle) + + log.info("Converting identifiers to character indices") + log.info("Number of identifiers: %d, Average length: %d characters" % + (len(identifiers), numpy.mean([len(i) for i in identifiers]))) + + char2ind = {c: i + 1 for i, c in enumerate(sorted(string.ascii_lowercase))} + + char_id_seq = [] + splits = [] + for identifier in identifiers: + # iterate through the identifier and convert to array of char indices & boolean split array + index_arr = [] + split_arr = [] + skip_char = False + for char in identifier.strip(): + if char in char2ind: + index_arr.append(char2ind[char]) + if skip_char: + skip_char = False + continue + split_arr.append(0) + elif char == " ": + split_arr.append(1) + skip_char = True + else: + log.warning("Unexpected symbol %s in identifier", char) + assert len(index_arr) == len(split_arr) + char_id_seq.append(index_arr) + splits.append(split_arr) + + log.info("Number of subtokens: %d, Number of distinct characters: %d" % + (sum(sum(split_arr) for split_arr in splits) + len(identifiers), + len({i for index_arr in char_id_seq for i in index_arr}))) + + log.info("Train/test splitting...") + n_train = int((1 - test_ratio) * len(char_id_seq)) + X_train = char_id_seq[:n_train] + X_test = char_id_seq[n_train:] + y_train = splits[:n_train] + y_test = splits[n_train:] + log.info("Number of train samples: %s, number of test samples: %s" % (len(X_train), + len(X_test))) + log.info("Padding the sequences...") + X_train = pad_sequences(X_train, maxlen=max_identifier_len, padding=padding) + X_test = pad_sequences(X_test, maxlen=max_identifier_len, padding=padding) + y_train = pad_sequences(y_train, maxlen=max_identifier_len, padding=padding) + y_test = pad_sequences(y_test, maxlen=max_identifier_len, padding=padding) + + return X_train, X_test, y_train[:, :, None], y_test[:, :, None] diff --git a/sourced/ml/core/algorithms/id_splitter/nn_model.py b/sourced/ml/core/algorithms/id_splitter/nn_model.py new file mode 100644 index 0000000..e917493 --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/nn_model.py @@ -0,0 +1,243 @@ +import string +from typing import Callable, List, Tuple, Union +import warnings + +import keras +from keras import backend as kbackend +from keras.layers import ( + BatchNormalization, Concatenate, Conv1D, Dense, Embedding, Input, TimeDistributed) +from keras.models import Model +import numpy +try: + import tensorflow as tf +except ImportError: + warnings.warn("Tensorflow is not installed, dependent functionality is unavailable.") + + +LOSS = "binary_crossentropy" +METRICS = ["accuracy"] +# Number 
of unique characters and dimension of the embedding layer +NUM_CHARS = len(string.ascii_lowercase) + + +def register_metric(metric: Union[str, Callable]) -> Union[str, Callable]: + """ + Decorator function to register the metrics in the METRICS constant. + + :param metric: name of the tensorflow metric or custom function metric. + :return: the metric. + """ + assert isinstance(metric, str) or callable(metric) + METRICS.append(metric) + return metric + + +def prepare_devices(devices: str) -> Tuple[str]: + """ + Extract devices from arguments. + + :param devices: devices to use passed as one string argument. + :return: split devices. + """ + devices = devices.split(",") + if len(devices) == 2: + dev0, dev1 = ("/gpu:" + dev for dev in devices) + elif len(devices) == 1: + if int(devices[0]) != -1: + dev0 = dev1 = "/gpu:" + devices[0] + else: + dev0 = dev1 = "/cpu:0" + else: + raise ValueError("Expected 1 or 2 devices but got %d from the devices argument %s" % + (len(devices), devices)) + return dev0, dev1 + + +def prepare_input_emb(maxlen: int) -> Tuple[tf.Tensor]: + """ + Builds character embeddings, a dense representation of characters to feed the RNN with. + + :param maxlen: maximum length of the input sequence. + :return: input and one-hot character embedding layer. + """ + char_seq = Input((maxlen,)) + emb = Embedding(input_dim=NUM_CHARS + 1, output_dim=NUM_CHARS + 1, input_length=maxlen, + mask_zero=False, weights=[numpy.eye(NUM_CHARS + 1)], trainable=False)(char_seq) + return char_seq, emb + + +def add_output_layer(hidden_layer: tf.Tensor) -> keras.layers.wrappers.TimeDistributed: + """ + Applies a Dense layer to each of the timestamps of a hidden layer, independently. + The output layer has 1 sigmoid per character which predicts if there is a space or not + before the character. + + :param hidden_layer: hidden layer before the output layer. + :return: output layer. + """ + norm_input = BatchNormalization()(hidden_layer) + return TimeDistributed(Dense(1, activation="sigmoid"))(norm_input) + + +def add_rnn(X: tf.Tensor, units: int, rnn_layer: str, dev0: str = "/gpu:0", + dev1: str = "/gpu:1") -> tf.Tensor: + """ + Adds a bidirectional RNN layer with the specified parameters. + + :param X: input layer. + :param units: number of neurons in the output layer. + :param rnn_layer: type of cell in the RNN. + :param dev0: device that will be used as forward pass of RNN and concatenation. + :param dev1: device that will be used as backward pass. + :return: output bidirectional RNN layer. + """ + # select the type of RNN layer + rnn_layer = getattr(keras.layers, rnn_layer) + + # add the forward & backward RNN + with tf.device(dev0): + forward = rnn_layer(units=units, return_sequences=True)(X) + with tf.device(dev1): + backward = rnn_layer(units=units, return_sequences=True, go_backwards=True)(X) + + # concatenate + with tf.device(dev1): + bidi = Concatenate(axis=-1)([forward, backward]) + return bidi + + +def build_rnn(maxlen: int, units: int, stack: int, optimizer: str, dev0: str, + dev1: str, rnn_layer: str) -> keras.engine.training.Model: + """ + Builds a RNN model with the parameters specified as arguments. + + :param maxlen: maximum length of the input sequence. + :param units: number of neurons or dimensionality of the output RNN. + :param stack: number of RNN layers to stack. + :param optimizer: algorithm to use as an optimizer for the RNN. + :param rnn_layer: recurrent layer type to use. + :param dev0: first device to use when running specific operations. 
+ :param dev1: second device to use when running specific operations. + :return: compiled RNN model. + """ + # prepare the model + with tf.device(dev0): + char_seq, hidden_layer = prepare_input_emb(maxlen) + + # stack the BiDi-RNN layers + for _ in range(stack): + hidden_layer = add_rnn(hidden_layer, units=units, rnn_layer=rnn_layer, + dev0=dev0, dev1=dev1) + output = add_output_layer(hidden_layer) + + # compile the model + model = Model(inputs=char_seq, outputs=output) + model.compile(optimizer=optimizer, loss=LOSS, metrics=METRICS) + return model + + +def add_conv(X: tf.Tensor, filters: List[int], kernel_sizes: List[int], + output_n_filters: int) -> tf.Tensor: + """ + Builds a single convolutional layer. + + :param X: input layer. + :param filters: number of output filters in the convolution. + :param kernel_sizes: list of lengths of the 1D convolution window. + :param output_n_filters: number of 1D output filters. + :return: output layer. + """ + # normalize the input + X = BatchNormalization()(X) + + # add convolutions + convs = [] + + for n_filters, kernel_size in zip(filters, kernel_sizes): + conv = Conv1D(filters=n_filters, kernel_size=kernel_size, padding="same", + activation="relu") + convs.append(conv(X)) + + # concatenate all convolutions + conc = Concatenate(axis=-1)(convs) + conc = BatchNormalization()(conc) + + # dimensionality reduction + conv = Conv1D(filters=output_n_filters, kernel_size=1, padding="same", activation="relu") + return conv(conc) + + +def build_cnn(maxlen: int, filters: List[int], output_n_filters: int, stack: int, + kernel_sizes: List[int], optimizer: str, device: str) -> keras.engine.training.Model: + """ + Builds a CNN model with the parameters specified as arguments. + + :param maxlen: maximum length of the input sequence. + :param filters: number of output filters in the convolution. + :param output_n_filters: number of 1d output filters. + :param stack: number of CNN layers to stack. + :param kernel_sizes: list of lengths of the 1D convolution window. + :param optimizer: algorithm to use as an optimizer for the CNN. + :param device: device to use when running specific operations. + :return: compiled CNN model. + """ + # prepare the model + with tf.device(device): + char_seq, hidden_layer = prepare_input_emb(maxlen) + + # stack the CNN layers + for _ in range(stack): + hidden_layer = add_conv(hidden_layer, filters=filters, kernel_sizes=kernel_sizes, + output_n_filters=output_n_filters) + output = add_output_layer(hidden_layer) + + # compile the model + model = Model(inputs=char_seq, outputs=output) + model.compile(optimizer=optimizer, loss=LOSS, metrics=METRICS) + return model + + +@register_metric +def precision(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the precision, a metric for multi-label classification of + how many selected items are relevant. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of precision. + """ + true_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true * y_pred, 0, 1))) + predicted_positives = kbackend.sum(kbackend.round(kbackend.clip(y_pred, 0, 1))) + precision = true_positives / (predicted_positives + kbackend.epsilon()) + return precision + + +@register_metric +def recall(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the recall, a metric for multi-label classification of + how many relevant items are selected. + + :param y_true: tensor of true labels. 
+ :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of recall. + """ + true_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true * y_pred, 0, 1))) + possible_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true, 0, 1))) + recall = true_positives / (possible_positives + kbackend.epsilon()) + return recall + + +@register_metric +def f1score(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the F1 score, the harmonic average of precision and recall. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of F1 score. + """ + prec = precision(y_true, y_pred) + rec = recall(y_true, y_pred) + return 2 * prec * rec / (prec + rec + kbackend.epsilon()) diff --git a/sourced/ml/core/algorithms/id_splitter/pipeline.py b/sourced/ml/core/algorithms/id_splitter/pipeline.py new file mode 100644 index 0000000..debe27a --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/pipeline.py @@ -0,0 +1,222 @@ +from datetime import datetime +import logging +import os +import random +from typing import Callable, Iterable, List, Tuple +import warnings + +import keras +from keras import backend as kbackend +from keras.callbacks import CSVLogger, LearningRateScheduler, ModelCheckpoint, TensorBoard +import numpy +try: + import tensorflow as tf +except ImportError: + warnings.warn("Tensorflow is not installed, dependent functionality is unavailable.") + + +# additional variable to avoid any division by zero when computing the precision and recall metrics +EPSILON = 10 ** -8 +# threshold that is used to binarize predictions of the model +DEFAULT_THRESHOLD = 0.5 + + +def set_random_seed(seed: int) -> None: + """ + Fixes a random seed for reproducibility. + + :param seed: seed value. + """ + numpy.random.seed(seed) + random.seed(seed) + tf.set_random_seed(seed) + + +def binarize(matrix: numpy.array, threshold: float, inplace: bool = True) -> numpy.array: + """ + Helper function to binarize a matrix. + + :param matrix: matrix as a numpy.array. + :param threshold: if value >= threshold then the value will be 1, else 0. + :param inplace: whether to modify the matrix inplace or not. + :return: the binarized matrix. + """ + mask = matrix >= threshold + if inplace: + matrix_ = matrix + else: + matrix_ = matrix.copy() + matrix_[mask] = 1 + matrix_[numpy.logical_not(mask)] = 0 + return matrix_ + + +def str2ints(params: str) -> List[int]: + """ + Convert a string with integer parameters to a list of integers. + + :param params: string that contains integer parameters separated by commas. + :return: list of integers. + """ + return list(map(int, params.split(","))) + + +def precision_np(y_true: numpy.array, y_pred: numpy.array, epsilon: float = EPSILON) -> float: + """ + Computes the precision metric, a metric for multi-label classification of + how many selected items are relevant. + + :param y_true: ground truth labels - expect binary values. + :param y_pred: predicted labels - expect binary values. + :param epsilon: added to the denominator to avoid any division by zero. + :return: precision metric. + """ + true_positives = numpy.sum(y_true * y_pred) + predicted_positives = numpy.sum(y_pred) + return true_positives / (predicted_positives + epsilon) + + +def recall_np(y_true: numpy.array, y_pred: numpy.array, epsilon: float = EPSILON) -> float: + """ + Computes the recall metric, a metric for multi-label classification of + how many relevant items are selected. 
+ + :param y_true: matrix with ground truth labels - expect binary values. + :param y_pred: matrix with predicted labels - expect binary values. + :param epsilon: added to the denominator to avoid any division by zero. + :return: recall metric. + """ + true_positives = numpy.sum(y_true * y_pred) + possible_positives = numpy.sum(y_true) + return true_positives / (possible_positives + epsilon) + + +def report(model: keras.engine.training.Model, X: numpy.array, y: numpy.array, batch_size: int, + threshold: float = DEFAULT_THRESHOLD, epsilon: float = EPSILON) -> None: + """ + Prints a metric report of the `model` on the data `X` & `y`. + The metrics printed are precision, recall, F1 score. + + :param model: model considered. + :param X: features. + :param y: labels (expected binary labels). + :param batch_size: batch size that will be used for prediction. + :param threshold: threshold to binarize the predictions. + :param epsilon: added to the denominator to avoid any division by zero. + """ + log = logging.getLogger("report") + + # predict & skip the last dimension & binarize + predictions = model.predict(X, batch_size=batch_size, verbose=1)[:, :, 0] + predictions = binarize(predictions, threshold) + + # report + pr = precision_np(y[:, :, 0], predictions, epsilon=epsilon) + rec = recall_np(y[:, :, 0], predictions, epsilon=epsilon) + f1 = 2 * pr * rec / (pr + rec + epsilon) + log.info("precision: %.3f, recall: %.3f, f1: %.3f" % (pr, rec, f1)) + + +def config_keras() -> None: + """ + Initializes keras backend session. + """ + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + kbackend.tensorflow_backend.set_session(tf.Session(config=config)) + + +def build_train_generator(X: numpy.array, y: numpy.array, + batch_size: int = 500) -> Iterable[Tuple[numpy.array]]: + """ + Builds the generator that yields features and their labels. + + :param X: features. + :param y: binary labels. + :param batch_size: higher values better utilize GPUs. + :return: generator of features and their labels. + """ + assert X.shape[0] == y.shape[0], "Number of samples mismatch in X and y." + + def xy_generator(): + while True: + n_batches = X.shape[0] // batch_size + if n_batches * batch_size < X.shape[0]: + n_batches += 1 # to yield last samples + for i in range(n_batches): + start = i * batch_size + end = min((i + 1) * batch_size, X.shape[0]) + yield X[start:end], y[start:end] + return xy_generator() + + +def build_schedule(lr: float, final_lr: float, n_epochs: int) -> Callable: + """ + Builds the schedule of which the learning rate decreases. + The schedule makes the learning rate decrease linearly. + + :param lr: initial learning rate. + :param final_lr: final learning rate. + :param n_epochs: number of training epochs. + :return: the schedule of the learning rate. + """ + delta = (lr - final_lr) / n_epochs + + def schedule(epoch: int) -> float: + assert 0 <= epoch < n_epochs + return lr - delta * epoch + return schedule + + +def make_lr_scheduler(lr: float, final_lr: float, n_epochs: int, + verbose: int = 1) -> keras.callbacks.LearningRateScheduler: + """ + Prepares the scheduler to decrease the learning rate while training. + + :param lr: initial learning rate. + :param final_lr: final learning rate. + :param n_epochs: number of training epochs. + :param verbose: level of verbosity. + :return: LearningRateScheduler with linear schedule of the learning rate. 
+ """ + schedule = build_schedule(lr, final_lr, n_epochs) + return LearningRateScheduler(schedule=schedule, verbose=verbose) + + +def prepare_callbacks(output_dir: str) -> Tuple[Callable]: + """ + Prepares logging, tensorboard, model checkpoint callbacks and stores the outputs in output_dir. + + :param output_dir: path to the results. + :return: list of callbacks. + """ + time = datetime.now().strftime("%y%m%d-%H%M") + log_dir = os.path.join(output_dir, "tensorboard" + time) + logging.info("Tensorboard directory: %s" % log_dir) + tensorboard = TensorBoard(log_dir=log_dir, batch_size=1000, write_images=True, + write_graph=True) + csv_path = os.path.join(output_dir, "csv_logger_" + time + ".txt") + logging.info("CSV logs: %s" % csv_path) + csv_logger = CSVLogger(csv_path) + + filepath = os.path.join(output_dir, "best_" + time + ".model") + model_saver = ModelCheckpoint(filepath, monitor="val_recall", verbose=1, save_best_only=True, + mode="max") + return tensorboard, csv_logger, model_saver + + +def create_generator_params(batch_size: int, samples_per_epoch: int, n_samples: int, + epochs: int) -> Tuple[int]: + """ + Helper function to split a huge dataset into smaller ones to enable more frequent reports. + + :param batch_size: batch size. + :param samples_per_epoch: number of samples per mini-epoch or before each report. + :param n_samples: total number of samples. + :param epochs: number of epochs over the full dataset. + :return: number of steps per epoch (should be used with the generator) and number of sub-epochs + where during sub-epoch only samples_per_epoch will be generated. + """ + steps_per_epoch = samples_per_epoch // batch_size + n_epochs = numpy.ceil(epochs * n_samples / samples_per_epoch) + return steps_per_epoch, n_epochs diff --git a/sourced/ml/core/algorithms/swivel.py b/sourced/ml/core/algorithms/swivel.py new file mode 100644 index 0000000..bfd126f --- /dev/null +++ b/sourced/ml/core/algorithms/swivel.py @@ -0,0 +1,491 @@ +#!/usr/bin/env python3 +# +# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2017 Sourced Technologies S. L. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Submatrix-wise Vector Embedding Learner. + +Implementation of SwiVel algorithm described at: +http://arxiv.org/abs/1602.02215 + +This program expects an input directory that contains the following files. + + row_vocab.txt, col_vocab.txt + + The row an column vocabulary files. Each file should contain one token per + line; these will be used to generate a tab-separate file containing the + trained embeddings. + + row_sums.txt, col_sum.txt + + The matrix row and column marginal sums. Each file should contain one + decimal floating point number per line which corresponds to the marginal + count of the matrix for that row or column. + + shards.recs + + A file containing the sub-matrix shards, stored as TFRecords. 
Each shard is + expected to be a serialzed tf.Example protocol buffer with the following + properties: + + global_row: the global row indices contained in the shard + global_col: the global column indices contained in the shard + sparse_local_row, sparse_local_col, sparse_value: three parallel arrays + that are a sparse representation of the submatrix counts. + +It will generate embeddings, training from the input directory for +the specified number of epochs. When complete, it will output the trained +vectors to a tab-separated file that contains one line per embedding. Row and +column embeddings are stored in separate files. + +""" + +import glob +import math +import os +import threading +import time + +import numpy +import tensorflow as tf +from tensorflow.python.client import device_lib + +flags = tf.app.flags + +flags.DEFINE_string("input_base_path", None, + "Directory containing input shards, vocabularies, " + "and marginals.") +flags.DEFINE_string("output_base_path", None, + "Path where to write the trained embeddings.") +flags.DEFINE_integer("embedding_size", 300, "Size of the embeddings") +flags.DEFINE_boolean("trainable_bias", False, "Biases are trainable") +flags.DEFINE_integer("submatrix_rows", 4096, + "Rows in each training submatrix. This must match " + "the training data.") +flags.DEFINE_integer("submatrix_cols", 4096, + "Rows in each training submatrix. This must match " + "the training data.") +flags.DEFINE_float("loss_multiplier", 1.0 / 4096, + "constant multiplier on loss.") +flags.DEFINE_float("confidence_exponent", 0.5, + "Exponent for l2 confidence function") +flags.DEFINE_float("confidence_scale", 0.25, + "Scale for l2 confidence function") +flags.DEFINE_float("confidence_base", 0.1, "Base for l2 confidence function") +flags.DEFINE_float("learning_rate", 1.0, "Initial learning rate") +flags.DEFINE_string("optimizer", "Adagrad", + "SGD optimizer (tf.train.*Optimizer)") +flags.DEFINE_integer("num_concurrent_steps", 2, + "Number of threads to train with") +flags.DEFINE_integer("num_readers", 4, + "Number of threads to read the input data and feed it") +flags.DEFINE_float("num_epochs", 40, "Number epochs to train for") +flags.DEFINE_float("per_process_gpu_memory_fraction", 0, + "Fraction of GPU memory to use, 0 means allow_growth") +flags.DEFINE_integer("num_gpus", 0, + "Number of GPUs to use, 0 means all available") +flags.DEFINE_string("logs", "", + "Path for TensorBoard logs (empty value disables them)") + +FLAGS = flags.FLAGS + + +def log(message, *args, **kwargs): + tf.logging.info(message, *args, **kwargs) + + +def get_available_gpus(): + return [d.name for d in device_lib.list_local_devices() + if d.device_type == "GPU"] + + +def embeddings_with_init(vocab_size, embedding_dim, name): + """Creates and initializes the embedding tensors.""" + return tf.get_variable(name=name, + shape=[vocab_size, embedding_dim], + initializer=tf.random_normal_initializer( + stddev=math.sqrt(1.0 / embedding_dim))) + + +def count_matrix_input(filenames, submatrix_rows, submatrix_cols): + """Reads submatrix shards from disk.""" + filename_queue = tf.train.string_input_producer(filenames) + reader = tf.WholeFileReader() + _, serialized_example = reader.read(filename_queue) + features = tf.parse_single_example( + serialized_example, + features={ + "global_row": tf.FixedLenFeature([submatrix_rows], dtype=tf.int64), + "global_col": tf.FixedLenFeature([submatrix_cols], dtype=tf.int64), + "sparse_local_row": tf.VarLenFeature(dtype=tf.int64), + "sparse_local_col": 
tf.VarLenFeature(dtype=tf.int64), + "sparse_value": tf.VarLenFeature(dtype=tf.float32) + }) + + global_row = features["global_row"] + global_col = features["global_col"] + + sparse_local_row = features["sparse_local_row"].values + sparse_local_col = features["sparse_local_col"].values + sparse_count = features["sparse_value"].values + + sparse_indices = tf.concat(axis=1, values=[tf.expand_dims(sparse_local_row, 1), + tf.expand_dims(sparse_local_col, 1)]) + count = tf.sparse_to_dense(sparse_indices, [submatrix_rows, submatrix_cols], + sparse_count, validate_indices=False) + + queued_global_row, queued_global_col, queued_count = tf.train.batch( + [global_row, global_col, count], + batch_size=1, + num_threads=FLAGS.num_readers, + capacity=32) + + queued_global_row = tf.reshape(queued_global_row, [submatrix_rows]) + queued_global_col = tf.reshape(queued_global_col, [submatrix_cols]) + queued_count = tf.reshape(queued_count, [submatrix_rows, submatrix_cols]) + + return queued_global_row, queued_global_col, queued_count + + +def read_marginals_file(filename): + """Reads text file with one number per line to an array.""" + with open(filename) as lines: + return [float(line) for line in lines] + + +def write_embedding_tensor_to_disk(vocab_path, output_path, sess, embedding): + """Writes tensor to output_path as tsv""" + # Fetch the embedding values from the model + embeddings = sess.run(embedding) + + with open(output_path, "w") as out_f: + with open(vocab_path) as vocab_f: + for index, word in enumerate(vocab_f): + word = word.strip() + embedding = embeddings[index] + out_f.write(word + "\t" + "\t".join( + [str(x) for x in embedding]) + "\n") + + +def write_embeddings_to_disk(config, model, sess): + """Writes row and column embeddings disk""" + # Row Embedding + row_vocab_path = config.input_base_path + "/row_vocab.txt" + row_embedding_output_path = config.output_base_path + "/row_embedding.tsv" + log("Writing row embeddings to: %s", row_embedding_output_path) + write_embedding_tensor_to_disk(row_vocab_path, row_embedding_output_path, + sess, model.row_embedding) + + # Column Embedding + col_vocab_path = config.input_base_path + "/col_vocab.txt" + col_embedding_output_path = config.output_base_path + "/col_embedding.tsv" + log("Writing column embeddings to: %s", col_embedding_output_path) + write_embedding_tensor_to_disk(col_vocab_path, col_embedding_output_path, + sess, model.col_embedding) + + +class SwivelModel: + """Small class to gather needed pieces from a Graph being built.""" + + def __init__(self, config): + """Construct graph for dmc.""" + self._config = config + + # Create paths to input data files + log("Reading model from: %s", config.input_base_path) + count_matrix_files = glob.glob(os.path.join(config.input_base_path, "shard-*.pb")) + row_sums_path = os.path.join(config.input_base_path, "row_sums.txt") + col_sums_path = os.path.join(config.input_base_path, "col_sums.txt") + + # Read marginals + row_sums = read_marginals_file(row_sums_path) + col_sums = read_marginals_file(col_sums_path) + + self.n_rows = len(row_sums) + self.n_cols = len(col_sums) + log("Matrix dim: (%d,%d) SubMatrix dim: (%d,%d)", + self.n_rows, self.n_cols, config.submatrix_rows, + config.submatrix_cols) + if self.n_cols < config.submatrix_cols: + raise ValueError( + "submatrix_cols={0} can not be bigger than columns number={1} " + "(specify submatrix_cols={1})".format(config.submatrix_cols, self.n_cols)) + if self.n_rows < config.submatrix_rows: + raise ValueError( + "submatrix_rows={0} can not be bigger than 
rows number={1} " + "(specify submatrix_rows={1})".format(config.submatrix_rows, self.n_cols)) + self.n_submatrices = (self.n_rows * self.n_cols / + (config.submatrix_rows * config.submatrix_cols)) + log("n_submatrices: %d", self.n_submatrices) + + with tf.device("/cpu:0"): + # ===== CREATE VARIABLES ====== + # Get input + global_row, global_col, count = count_matrix_input( + count_matrix_files, config.submatrix_rows, + config.submatrix_cols) + + # Embeddings + self.row_embedding = embeddings_with_init( + embedding_dim=config.embedding_size, + vocab_size=self.n_rows, + name="row_embedding") + self.col_embedding = embeddings_with_init( + embedding_dim=config.embedding_size, + vocab_size=self.n_cols, + name="col_embedding") + tf.summary.histogram("row_emb", self.row_embedding) + tf.summary.histogram("col_emb", self.col_embedding) + + matrix_log_sum = math.log(numpy.sum(row_sums) + 1) + row_bias_init = [math.log(x + 1) for x in row_sums] + col_bias_init = [math.log(x + 1) for x in col_sums] + self.row_bias = tf.Variable( + row_bias_init, trainable=config.trainable_bias) + self.col_bias = tf.Variable( + col_bias_init, trainable=config.trainable_bias) + tf.summary.histogram("row_bias", self.row_bias) + tf.summary.histogram("col_bias", self.col_bias) + + # Add optimizer + l2_losses = [] + sigmoid_losses = [] + self.global_step = tf.Variable(0, name="global_step") + learning_rate = tf.Variable(config.learning_rate, + name="learning_rate") + opt = getattr(tf.train, FLAGS.optimizer + "Optimizer")( + learning_rate) + tf.summary.scalar("learning_rate", learning_rate) + + all_grads = [] + + devices = ["/gpu:%d" % i for i in range(FLAGS.num_gpus)] \ + if FLAGS.num_gpus > 0 else get_available_gpus() + self.devices_number = len(devices) + if not self.devices_number: + devices = ["/cpu:0"] + self.devices_number = 1 + for dev in devices: + with tf.device(dev): + with tf.name_scope(dev[1:].replace(":", "_")): + # ===== CREATE GRAPH ===== + # Fetch embeddings. + selected_row_embedding = tf.nn.embedding_lookup( + self.row_embedding, global_row) + selected_col_embedding = tf.nn.embedding_lookup( + self.col_embedding, global_col) + + # Fetch biases. + selected_row_bias = tf.nn.embedding_lookup( + [self.row_bias], global_row) + selected_col_bias = tf.nn.embedding_lookup( + [self.col_bias], global_col) + + # Multiply the row and column embeddings to generate + # predictions. + predictions = tf.matmul( + selected_row_embedding, selected_col_embedding, + transpose_b=True) + + # These binary masks separate zero from non-zero values. + count_is_nonzero = tf.to_float(tf.cast(count, tf.bool)) + count_is_zero = 1 - count_is_nonzero + + objectives = count_is_nonzero * tf.log(count + 1e-30) + objectives -= tf.reshape( + selected_row_bias, [config.submatrix_rows, 1]) + objectives -= selected_col_bias + objectives += matrix_log_sum + + err = predictions - objectives + + # The confidence function scales the L2 loss based on + # the raw co-occurrence count. 
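+                    # In other words: l2_confidence = confidence_base + confidence_scale * count ** confidence_exponent.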
+ l2_confidence = ( + config.confidence_base + + config.confidence_scale * tf.pow( + count, config.confidence_exponent)) + + l2_loss = config.loss_multiplier * tf.reduce_sum( + 0.5 * l2_confidence * err * err * count_is_nonzero) + l2_losses.append(tf.expand_dims(l2_loss, 0)) + + sigmoid_loss = config.loss_multiplier * tf.reduce_sum( + tf.nn.softplus(err) * count_is_zero) + sigmoid_losses.append(tf.expand_dims(sigmoid_loss, 0)) + + loss = l2_loss + sigmoid_loss + grads = opt.compute_gradients(loss) + all_grads.append(grads) + + with tf.device("/cpu:0"): + # ===== MERGE LOSSES ===== + l2_loss = tf.reduce_mean(tf.concat(axis=0, values=l2_losses), 0, + name="l2_loss") + sigmoid_loss = tf.reduce_mean( + tf.concat(axis=0, values=sigmoid_losses), 0, + name="sigmoid_loss") + overall_loss = l2_loss + sigmoid_loss + average = tf.train.ExponentialMovingAverage(0.999) + loss_average_op = average.apply( + (overall_loss, l2_loss, sigmoid_loss)) + self.loss = average.average(overall_loss) + tf.summary.scalar("overall_loss", self.loss) + tf.summary.scalar("l2_loss", average.average(l2_loss)) + tf.summary.scalar("sigmoid_loss", average.average(sigmoid_loss)) + + # Apply the gradients to adjust the shared variables. + apply_gradient_ops = [] + for grads in all_grads: + apply_gradient_ops.append(opt.apply_gradients( + grads, global_step=self.global_step)) + + self.train_op = tf.group(loss_average_op, *apply_gradient_ops) + self.saver = tf.train.Saver(sharded=True) + + def initialize_summary(self, sess): + log("creating TensorBoard stuff...") + self.summary = tf.summary.merge_all() + self.writer = tf.summary.FileWriter(FLAGS.logs, sess.graph) + projector_config = \ + tf.contrib.tensorboard.plugins.projector.ProjectorConfig() + embedding_config = projector_config.embeddings.add() + length = min(10000, self.n_rows, self.n_cols) + self.embedding10k = tf.Variable( + tf.zeros((length, self._config.embedding_size)), + name="top10k_embedding") + embedding_config.tensor_name = self.embedding10k.name + embedding_config.metadata_path = os.path.join( + self._config.input_base_path, "row_vocab.txt") + tf.contrib.tensorboard.plugins.projector.visualize_embeddings( + self.writer, projector_config) + self.saver = tf.train.Saver((self.embedding10k,), max_to_keep=1) + + def write_summary(self, sess): + log("writing the summary...") + length = min(10000, self.n_rows, self.n_cols) + assignment = self.embedding10k.assign( + (self.row_embedding[:length] + self.col_embedding[:length]) / 2) + summary, _, global_step = sess.run( + (self.summary, assignment, self.global_step)) + self.writer.add_summary(summary, global_step) + self.saver.save( + sess, os.path.join(FLAGS.logs, "embeddings10k.checkpoint"), + global_step) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + start_time = time.time() + + omitted = {"handler", "command"} + + log("Swivel parameters:\n" + "\n".join( + "\t{:20} {}".format(key, value) for key, value in + sorted(FLAGS.__dict__.items()) if key not in omitted)) + # Create the output path. If this fails, it really ought to fail now. :) + if not os.path.isdir(FLAGS.output_base_path): + os.makedirs(FLAGS.output_base_path) + + # Create and run model + with tf.Graph().as_default(): + log("creating the model...") + model = SwivelModel(FLAGS) + + # Create a session for running Ops on the Graph. 
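+        # A per_process_gpu_memory_fraction of 0 means no fixed memory cap: allow_growth lets the session allocate GPU memory on demand.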
+ gpu_opts = {} + if FLAGS.per_process_gpu_memory_fraction > 0: + gpu_opts["per_process_gpu_memory_fraction"] = \ + FLAGS.per_process_gpu_memory_fraction + else: + gpu_opts["allow_growth"] = True + gpu_options = tf.GPUOptions(**gpu_opts) + sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + if FLAGS.logs: + model.initialize_summary(sess) + + # Run the Op to initialize the variables. + log("initializing the variables...") + sess.run(tf.global_variables_initializer()) + + # Start feeding input + log("starting the input threads...") + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + # Calculate how many steps each thread should run + n_total_steps = int(FLAGS.num_epochs * model.n_rows * model.n_cols) / ( + FLAGS.submatrix_rows * FLAGS.submatrix_cols) + n_steps_per_thread = n_total_steps / ( + FLAGS.num_concurrent_steps * model.devices_number) + n_submatrices_to_train = model.n_submatrices * FLAGS.num_epochs + t0 = [time.time()] + n_steps_between_status_updates = 100 + n_steps_between_summary_updates = 10000 + status_i = [0, 0] + status_lock = threading.Lock() + msg = ("%%%dd/%%d submatrices trained (%%.1f%%%%), " + "%%5.1f submatrices/sec | loss %%f") % \ + len(str(n_submatrices_to_train)) + + def TrainingFn(): + for _ in range(int(n_steps_per_thread)): + _, global_step, loss = sess.run(( + model.train_op, model.global_step, model.loss)) + + show_status = False + update_summary = False + with status_lock: + new_i = global_step // n_steps_between_status_updates + if new_i > status_i[0]: + status_i[0] = new_i + show_status = True + new_i = global_step // n_steps_between_summary_updates + if new_i > status_i[1]: + status_i[1] = new_i + update_summary = True + if show_status: + elapsed = float(time.time() - t0[0]) + log(msg, global_step, n_submatrices_to_train, + 100.0 * global_step / n_submatrices_to_train, + n_steps_between_status_updates / elapsed, loss) + t0[0] = time.time() + if update_summary and FLAGS.logs: + model.write_summary(sess) + + # Start training threads + train_threads = [] + for _ in range(FLAGS.num_concurrent_steps): + t = threading.Thread(target=TrainingFn) + train_threads.append(t) + t.start() + + # Wait for threads to finish. + for t in train_threads: + t.join() + + coord.request_stop() + coord.join(threads) + + # Write out vectors + write_embeddings_to_disk(FLAGS, model, sess) + + # Shutdown + sess.close() + log("Elapsed: %s", time.time() - start_time) + + +if __name__ == "__main__": + tf.app.run() diff --git a/sourced/ml/core/algorithms/tf_idf.py b/sourced/ml/core/algorithms/tf_idf.py new file mode 100644 index 0000000..7cbc21a --- /dev/null +++ b/sourced/ml/core/algorithms/tf_idf.py @@ -0,0 +1,5 @@ +import numpy + + +def log_tf_log_idf(tf, df, ndocs): + return numpy.log(1 + tf) * numpy.log(ndocs / df) diff --git a/sourced/ml/core/algorithms/token_parser.py b/sourced/ml/core/algorithms/token_parser.py new file mode 100644 index 0000000..fb26f1f --- /dev/null +++ b/sourced/ml/core/algorithms/token_parser.py @@ -0,0 +1,135 @@ +import re + +import Stemmer + + +class TokenParser: + """ + Common utilities for splitting and stemming tokens. + """ + NAME_BREAKUP_RE = re.compile(r"[^a-zA-Z]+") #: Regexp to split source code identifiers. + STEM_THRESHOLD = 6 #: We do not stem split parts shorter than or equal to this size. + MAX_TOKEN_LENGTH = 256 #: We cut identifiers longer than this value. + MIN_SPLIT_LENGTH = 3 #: We do not split source code identifiers shorter than this value. 
+ DEFAULT_SINGLE_SHOT = False #: True if we do not want to join small identifiers to next one. + # Example: 'sourced.ml.algorithms' -> ["sourc", "sourcedml", "algorithm", "mlalgorithm"]. + # if True we have only ["sourc", "algorithm"]. + # if you do not want to filter small tokens set min_split_length=1. + + def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH, + min_split_length=MIN_SPLIT_LENGTH, single_shot=DEFAULT_SINGLE_SHOT): + self._stemmer = Stemmer.Stemmer("english") + self._stemmer.maxCacheSize = 0 + self._stem_threshold = stem_threshold + self._max_token_length = max_token_length + self._min_split_length = min_split_length + self._single_shot = single_shot + + @property + def stem_threshold(self): + return self._stem_threshold + + @stem_threshold.setter + def stem_threshold(self, value): + if not isinstance(value, int): + raise TypeError("stem_threshold must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("stem_threshold must be greater than 0 - got %d" % value) + self._stem_threshold = value + + @property + def max_token_length(self): + return self._max_token_length + + @max_token_length.setter + def max_token_length(self, value): + if not isinstance(value, int): + raise TypeError("max_token_length must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("max_token_length must be greater than 0 - got %d" % value) + self._max_token_length = value + + @property + def min_split_length(self): + return self._min_split_length + + @min_split_length.setter + def min_split_length(self, value): + if not isinstance(value, int): + raise TypeError("min_split_length must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("min_split_length must be greater than 0 - got %d" % value) + self._min_split_length = value + + def __call__(self, token): + return self.process_token(token) + + def process_token(self, token): + for word in self.split(token): + yield self.stem(word) + + def stem(self, word): + if len(word) <= self.stem_threshold: + return word + return self._stemmer.stemWord(word) + + def split(self, token): + token = token.strip()[:self.max_token_length] + + def ret(name): + r = name.lower() + if len(name) >= self.min_split_length: + ret.last_subtoken = r + yield r + if ret.prev_p and not self._single_shot: + yield ret.prev_p + r + ret.prev_p = "" + elif not self._single_shot: + ret.prev_p = r + yield ret.last_subtoken + r + ret.last_subtoken = "" + ret.prev_p = "" + ret.last_subtoken = "" + + for part in self.NAME_BREAKUP_RE.split(token): + if not part: + continue + prev = part[0] + pos = 0 + for i in range(1, len(part)): + this = part[i] + if prev.islower() and this.isupper(): + yield from ret(part[pos:i]) + pos = i + elif prev.isupper() and this.islower(): + if 0 < i - 1 - pos <= self.min_split_length: + yield from ret(part[pos:i]) + pos = i + elif i - 1 > pos: + yield from ret(part[pos:i]) + pos = i + prev = this + last = part[pos:] + if last: + yield from ret(last) + + def __getstate__(self): + state = self.__dict__.copy() + del state["_stemmer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + self._stemmer = Stemmer.Stemmer("english") + + +class NoopTokenParser: + """ + One can use this class if he or she does not want to do any parsing. 
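+    It yields every token unchanged, without splitting or stemming.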
+ """ + + def process_token(self, token): + yield token + + def __call__(self, token): + return self.process_token(token) diff --git a/sourced/ml/core/algorithms/uast_id_distance.py b/sourced/ml/core/algorithms/uast_id_distance.py new file mode 100644 index 0000000..8be6fb9 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_id_distance.py @@ -0,0 +1,122 @@ +from itertools import combinations +from typing import Iterable, Tuple, Union + +import bblfsh + +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag +from sourced.ml.core.utils import bblfsh_roles + + +class Uast2IdDistance(UastIds2Bag): + """ + Converts a UAST to a list of identifiers pair and UAST distance between. + Distance metric must be defined in the inheritors. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + DEFAULT_MAX_DISTANCE = 10 # to avoid collecting all distances we skip too big ones + + def __init__(self, token2index=None, token_parser=None, max_distance=DEFAULT_MAX_DISTANCE): + """ + :param token2index: The mapping from tokens to token key. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + :param max_distance: specify to skip too distant identifiers + """ + super().__init__(token2index=token2index, token_parser=token_parser) + self.max_distance = max_distance + + def __call__(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str, int]]: + """ + Converts a UAST to a list of identifiers pair and UAST distance between. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: a list of (from identifier, to identifier) and distance pairs. + """ + for point1, point2 in combinations(self._process_uast(uast), 2): + if point1[0] == point2[0]: + continue # We do not want to calculate distance between the same identifiers + distance = self.distance(point1, point2) + if distance < self.max_distance: + yield ((point1[0], point2[0]) if point1[0] > point2[0] else + (point2[0], point1[0])), distance + + def distance(self, point1, point2) -> Union[int, float]: + """ + Calculate distance between two points. A point can be anything. self._process_uast returns + list of points in the specific class. + + :return: Distance between two points. + """ + raise NotImplementedError + + def _process_uast(self, node: bblfsh.Node) -> Iterable: + """ + Converts uast to points list. A point can be anything you need to calculate distance. + """ + raise NotImplementedError + + def _process_point(self, node, info): + if bblfsh_roles.IDENTIFIER in node.roles and node.token: + for sub in self._token_parser.process_token(node.token): + try: + yield (self._token2index[sub], info) + except KeyError: + continue + + +class Uast2IdTreeDistance(Uast2IdDistance): + """ + Converts a UAST to a list of identifiers pair and UAST tree distance between. + + __call__ is overridden here and return list instead of bag-of-words (dist). 
+ """ + def _process_uast(self, uast: bblfsh.Node) -> Iterable: + stack = [(uast, [])] + while stack: + node, ancestors = stack.pop() + yield from self._process_point(node, ancestors) + ancestors = list(ancestors) + ancestors.append(node) + stack.extend([(child, ancestors) for child in node.children]) + + def distance(self, point1, point2) -> int: + i = 0 + ancestors1 = point1[1] + ancestors2 = point2[1] + for i, (ancestor1, ancestor2) in enumerate(zip(ancestors1, ancestors2)): # noqa: B007 + if ancestor1 != ancestor2: + break + distance = self.calc_tree_distance(i, len(ancestors1), len(ancestors2)) + return distance + + @staticmethod + def calc_tree_distance(last_common_level, level1, level2): + return level1 + level2 - 2 * last_common_level + + +class Uast2IdLineDistance(Uast2IdDistance): + """ + Converts a UAST to a list of identifiers pair and code line distance between where applicable. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + def _process_uast(self, uast): + stack = [(uast, [0, 0])] + while stack: + node, last_position = stack.pop() + if node.start_position.line != 0: + # A lot of Nodes do not have position + # It is good heuristic to take the last Node in tree with a position. + last_position[0] = node.start_position.line + last_position[1] = 0 + if node.start_position.col != 0: + last_position[1] = node.start_position.col + yield from self._process_point(node, last_position) + stack.extend([(child, list(last_position)) for child in node.children]) + + def distance(self, point1, point2): + return abs(point1[1][0] - point2[1][0]) # subtract line numbers diff --git a/sourced/ml/core/algorithms/uast_ids_to_bag.py b/sourced/ml/core/algorithms/uast_ids_to_bag.py new file mode 100644 index 0000000..2e02f7f --- /dev/null +++ b/sourced/ml/core/algorithms/uast_ids_to_bag.py @@ -0,0 +1,110 @@ +from collections import defaultdict, deque + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser, TokenParser +from sourced.ml.core.algorithms.uast_to_bag import Uast2BagBase +from sourced.ml.core.utils import bblfsh_roles + + +def uast2sequence(root): + sequence = [] + nodes = defaultdict(deque) + stack = [root] + nodes[id(root)].extend(root.children) + while stack: + if nodes[id(stack[-1])]: + child = nodes[id(stack[-1])].popleft() + nodes[id(child)].extend(child.children) + stack.append(child) + else: + sequence.append(stack.pop()) + return sequence + + +class FakeVocabulary: + # FIXME(zurk): change to simple function. Vadim Markovtsev comments: + # > would rather made this a simple function and change roles2index + # type from [] to callable. Saves time to understand. + def __getitem__(self, item): + return item + + +class UastTokens2Bag(Uast2BagBase): + """ + Converts a UAST to a weighed bag of tokens via xpath. + """ + + XPATH = None # Should be overridden in child class + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'NoopTokenParser' is used if it is not specified. 
+ """ + self._token2index = FakeVocabulary() if token2index is None else token2index + self._token_parser = NoopTokenParser() if token_parser is None else token_parser + + @property + def token_parser(self): + return self._token_parser + + @property + def token2index(self): + return self._token2index + + def __call__(self, uast): + """ + Converts a UAST to a weighed bag-of-words. The weights are words frequencies. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: + """ + nodes = bblfsh.filter(uast, self.XPATH) + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag + + +class UastIds2Bag(UastTokens2Bag): + """ + Converts a UAST to a bag-of-identifiers. + """ + + XPATH = "//*[@roleIdentifier]" + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + """ + token_parser = TokenParser() if token_parser is None else token_parser + super().__init__(token2index, token_parser) + + def __call__(self, uast): + """ + HOTFIX for https://github.com/bblfsh/client-python/issues/92 + Converts a UAST to a weighed bag-of-identifiers. The weights are identifiers frequencies. + The tokens are preprocessed by _token_parser. + Overwrite __call__ to avoid issues with `bblfsh.filter`. + + :param uast: The UAST root node. + :return: bag + """ + nodes = [node for node in uast2sequence(uast) if bblfsh_roles.IDENTIFIER in node.roles] + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag diff --git a/sourced/ml/core/algorithms/uast_inttypes_to_graphlets.py b/sourced/ml/core/algorithms/uast_inttypes_to_graphlets.py new file mode 100644 index 0000000..d089756 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_inttypes_to_graphlets.py @@ -0,0 +1,59 @@ +from collections import defaultdict + +from sourced.ml.core.algorithms.uast_ids_to_bag import Uast2BagBase +from sourced.ml.core.algorithms.uast_struct_to_bag import Node + + +class Uast2GraphletBag(Uast2BagBase): + """ + Converts a UAST to a bag of graphlets. + The graphlet of a UAST node is composed from the node itself, its parent and its children. + Each node is represented by the internal role string. + """ + @staticmethod + def _extract_node(node, parent): + return Node(parent=parent, internal_type=node.internal_type) + + def uast2graphlets(self, uast): + """ + :param uast: The UAST root node. + :generate: The nodes which compose the UAST. + :class: 'Node' is used to access the nodes of the graphlets. + """ + root = self._extract_node(uast, None) + stack = [(root, uast)] + while stack: + parent, parent_uast = stack.pop() + children_nodes = [self._extract_node(child, parent) for child in parent_uast.children] + parent.children = children_nodes + stack.extend(zip(children_nodes, parent_uast.children)) + yield parent + + def node2key(self, node): + """ + Builds the string joining internal types of all the nodes + in the node's graphlet in the following order: + parent_node_child1_child2_child3. The children are sorted by alphabetic order. + str format is required for BagsExtractor. 
+ + :param node: a node of UAST + :return: The string key of node + """ + try: + parent_type = node.parent.internal_type + except AttributeError: + parent_type = None + key = [parent_type, node.internal_type] + key.extend(sorted(ch.internal_type for ch in node.children)) + return "_".join(map(str, key)) + + def __call__(self, uast): + """ + Converts a UAST to a weighed bag of graphlets. The weights are graphlets frequencies. + :param uast: The UAST root node. + :return: bag of graphlets. + """ + bag = defaultdict(int) + for node in self.uast2graphlets(uast): + bag[self.node2key(node)] += 1 + return bag diff --git a/sourced/ml/core/algorithms/uast_inttypes_to_nodes.py b/sourced/ml/core/algorithms/uast_inttypes_to_nodes.py new file mode 100644 index 0000000..5d9daee --- /dev/null +++ b/sourced/ml/core/algorithms/uast_inttypes_to_nodes.py @@ -0,0 +1,64 @@ +from typing import Iterable, Tuple, Union + +from bblfsh import Node +import numpy + +from sourced.ml.core.algorithms.uast_to_bag import Uast2BagThroughSingleScan + + +class Uast2QuantizedChildren(Uast2BagThroughSingleScan): + """ + Converts a UAST to a bag of children counts. + """ + + def __init__(self, npartitions: int = 20): + self.npartitions = npartitions + self.levels = {} + + def node2key(self, node: Node) -> Union[str, Tuple[str, int]]: + """ + :param node: a node in UAST. + :return: The string which consists of the internal type of the node and its number of + children. + """ + if not self.levels: + return node.internal_type, len(node.children) + qm = self.levels[node.internal_type] + quant_index = numpy.searchsorted(qm, len(node.children), side="right") - 1 + return "%s_%d" % (node.internal_type, quant_index) + + def quantize(self, frequencies: Iterable[Tuple[str, Iterable[Tuple[int, int]]]]): + for key, vals in frequencies: + self.levels[key] = self.quantize_unwrapped(vals) + + def quantize_unwrapped(self, children_freq: Iterable[Tuple[int, int]]) -> numpy.ndarray: + """ + Builds the quantization partition P that is a vector of length nb_partitions \ + whose entries are in strictly ascending order. + Quantization of x is defined as: + 0 if x <= P[0] + m if P[m-1] < x <= P[m] + n if P[n] <= x + + :param children_freq: distribution of the number of children. + :return: The array with quantization levels. 
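
Once `quantize` has filled `self.levels`, `node2key` buckets a node by its number of children with `numpy.searchsorted`. A worked example with hypothetical level borders for the `Block` internal type:

```python
import numpy

levels = {"Block": numpy.array([0, 2, 5, 10, 50])}  # hypothetical borders

n_children = 7
qm = levels["Block"]
quant_index = numpy.searchsorted(qm, n_children, side="right") - 1
print("%s_%d" % ("Block", quant_index))  # Block_2, the bucket starting at border 5
```
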
+ """ + levels = numpy.zeros(self.npartitions + 1, dtype=numpy.int32) + children_freq = sorted(children_freq) + max_nodes_per_bin = sum(i[1] for i in children_freq) / self.npartitions + levels[0] = children_freq[0][0] + accum = children_freq[0][1] + i = 1 + for v, f in children_freq[1:]: + accum += f + if accum > max_nodes_per_bin: + accum = f + if i < len(levels): + levels[i] = v + i += 1 + last = children_freq[-1][0] + if i < len(levels): + levels[i:] = last + else: + levels[-1] = last + return levels diff --git a/sourced/ml/core/algorithms/uast_struct_to_bag.py b/sourced/ml/core/algorithms/uast_struct_to_bag.py new file mode 100644 index 0000000..5ee4ce1 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_struct_to_bag.py @@ -0,0 +1,180 @@ +from collections import defaultdict +import random + +from sourced.ml.core.algorithms.uast_ids_to_bag import FakeVocabulary, Uast2BagBase, uast2sequence + + +class Uast2StructBagBase(Uast2BagBase): + SEP = ">" + + def __init__(self, stride, seq_len, node2index=None): + self._node2index = node2index if node2index is not None else FakeVocabulary() + self._stride = stride + if not isinstance(seq_len, (int, tuple, list)): + raise TypeError("Unexpected type of seq_len: %s" % type(seq_len)) + self._seq_lens = [seq_len] if isinstance(seq_len, int) else seq_len + + @property + def node2index(self): + return self._node2index + + +class Node2InternalType: + # FIXME(zurk): change to simple function. Vadim Markovtsev comments: + # > would rather made this a simple function and change roles2index + # type from [] to callable. Saves time to understand. + def __getitem__(self, item): + return item.internal_type + + +class UastSeq2Bag(Uast2StructBagBase): + """ + DFS traversal + preserves the order of node children. + """ + + def __init__(self, stride=1, seq_len=(3, 4), node2index=None): + _node2index = Node2InternalType() if node2index is None else node2index + super().__init__(stride, seq_len, _node2index) + + def __call__(self, uast): + bag = defaultdict(int) + node_sequence = uast2sequence(uast) + + # convert to str - requirement from wmhash.BagsExtractor + node_sequence = [self.node2index[n] for n in node_sequence] + + for seq_len in self._seq_lens: + for i in range(0, len(node_sequence) - seq_len + 1, self._stride): + key = self.SEP.join(node_sequence[i:i + seq_len]) + bag[key] += 1 + return bag + + +class Node: + def __init__(self, parent=None, internal_type=None): + self.parent = parent + self.internal_type = internal_type + self.children = [] + + @property + def neighbours(self): + neighbours = [] + if self.parent is not None: + neighbours.append(self.parent) + neighbours.extend(self.children) + return neighbours + + +class Uast2RandomWalks: + """ + Generation of random walks for UAST. + """ + + def __init__(self, p_explore_neighborhood, q_leave_neighborhood, n_walks, n_steps, + node2index=None, seed=None): + """ + Related article: https://arxiv.org/abs/1607.00653 + + :param p_explore_neighborhood: return parameter, p. Parameter p controls the likelihood of\ + immediately revisiting a node in the walk. Setting it to a\ + high value (> max(q, 1)) ensures that we are less likely to\ + sample an already visited node in the following two steps\ + (unless the next node in the walk had no other neighbor).\ + This strategy encourages moderate exploration and avoids\ + 2-hop redundancy in sampling. + :param q_leave_neighborhood: in-out parameter, q. Parameter q allows the search to\ + differentiate between “inward” and “outward” nodes. 
Such \ + walks obtain a local view of the underlying graph with \ + respect to the start node in the walk and approximate BFS \ + behavior in the sense that our samples comprise of nodes \ + within a small locality. + :param n_walks: Number of walks from each node. + :param n_steps: Number of steps in walk. + :param node2index: Specify node2index transformation. Node2InternalType() is used as \ + default. + :param seed: Random seed. + """ + self.p_explore_neighborhood = p_explore_neighborhood + self.q_leave_neighborhood = q_leave_neighborhood + self.n_walks = n_walks + self.n_steps = n_steps + self.node2index = node2index if node2index is not None else Node2InternalType() + if seed is not None: + random.seed(seed) + + def __call__(self, uast): + starting_nodes = self.prepare_starting_nodes(uast) + for _ in range(self.n_walks): + for start_node in starting_nodes: + yield self.random_walk(start_node) + + @staticmethod + def _extract_node(node, parent): + return Node(parent=parent, internal_type=node.internal_type) + + def prepare_starting_nodes(self, uast): + starting_nodes = [] + root = self._extract_node(uast, None) + stack = [(root, uast)] + while stack: + parent, parent_uast = stack.pop() + children_nodes = [self._extract_node(child, parent) for child in parent_uast.children] + parent.children = children_nodes + stack.extend(zip(children_nodes, parent_uast.children)) + starting_nodes.append(parent) + + return starting_nodes + + def random_walk(self, node): + walk = [node] + while len(walk) < self.n_steps: + walk.append(self.alias_sample(walk)) + + walk = [self.node2index[n] for n in walk] + return walk + + def alias_sample(self, walk): + """ + Compare to node2vec this sampling is a bit simpler because there is no loop in tree -> + so there are only 2 options with unnormalized probabilities 1/p & 1/q + Related article: https://arxiv.org/abs/1607.00653 + + :param walk: list of visited nodes + :return: next node to visit + """ + last_node = walk[-1] # correspond to node v in article + + if len(walk) == 1: + choice_list = last_node.children + if last_node.parent is not None: + choice_list.append(last_node.parent) + if len(choice_list) == 0: + return last_node + return random.choice(last_node.children) + + threshold = (1 / self.p_explore_neighborhood) + threshold /= (threshold + len(last_node.children) / self.q_leave_neighborhood) + + if random.random() <= threshold: + # With threshold probability we need to return back to previous node. + return walk[-2] # Node from previous step. Correspond to node t in article. 
+ + return random.choice(last_node.neighbours) + + +class UastRandomWalk2Bag(Uast2StructBagBase): + def __init__(self, p_explore_neighborhood=0.79, q_leave_neighborhood=0.82, n_walks=2, + n_steps=10, stride=1, seq_len=(2, 3), seed=42): + super().__init__(stride, seq_len) + self.uast2walks = Uast2RandomWalks(p_explore_neighborhood=p_explore_neighborhood, + q_leave_neighborhood=q_leave_neighborhood, + n_walks=n_walks, n_steps=n_steps, seed=seed) + + def __call__(self, uast): + bag = defaultdict(int) + for walk in self.uast2walks(uast): + for seq_len in self._seq_lens: + for i in range(0, len(walk) - seq_len + 1, self._stride): + # convert to str - requirement from wmhash.BagsExtractor + bag[self.SEP.join(walk[i:i + seq_len])] += 1 + return bag diff --git a/sourced/ml/core/algorithms/uast_to_bag.py b/sourced/ml/core/algorithms/uast_to_bag.py new file mode 100644 index 0000000..2d78ad5 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_to_bag.py @@ -0,0 +1,34 @@ +from collections import defaultdict +from typing import Dict + +from bblfsh import Node + + +class Uast2BagBase: + """ + Base class to convert UAST to a bag of anything. + """ + def __call__(self, uast: Node): + """ + Inheritors must implement this function. + + :param uast: The UAST root node. + """ + raise NotImplementedError + + +class Uast2BagThroughSingleScan(Uast2BagBase): + """ + Constructs the bag by doing a single tree traversal and turning every node into a string. + """ + def __call__(self, uast: Node) -> Dict[str, int]: + result = defaultdict(int) + stack = [uast] + while stack: + node = stack.pop() + stack.extend(node.children) + result[self.node2key(node)] += 1 + return result + + def node2key(self, node) -> str: + raise NotImplementedError diff --git a/sourced/ml/core/algorithms/uast_to_id_sequence.py b/sourced/ml/core/algorithms/uast_to_id_sequence.py new file mode 100644 index 0000000..1829bd7 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_to_id_sequence.py @@ -0,0 +1,30 @@ +from typing import Iterable + +import bblfsh + +from sourced.ml.core.algorithms.uast_id_distance import Uast2IdLineDistance + + +class Uast2IdSequence(Uast2IdLineDistance): + """ + Converts a UAST to a sorted sequence of identifiers. + Identifiers are sorted by position in code. + We do not change the order if positions are not present. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + def __call__(self, uast: bblfsh.Node) -> str: + """ + Converts a UAST to a sorted sequence of identifiers. + Identifiers are sorted by position in code. + We do not change the order if positions are not present. + + :param uast: The UAST root node. + :return: string with a sequence of identifiers + """ + return self.concat(id for id, pos in sorted(self._process_uast(uast), key=lambda x: x[1])) + + @staticmethod + def concat(id_sequence: Iterable): + return " ".join(id_sequence) diff --git a/sourced/ml/core/algorithms/uast_to_role_id_pairs.py b/sourced/ml/core/algorithms/uast_to_role_id_pairs.py new file mode 100644 index 0000000..08a81d7 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_to_role_id_pairs.py @@ -0,0 +1,69 @@ +from typing import Iterable, Tuple + +import bblfsh + +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag +from sourced.ml.core.utils import bblfsh_roles + + +class Uast2RoleIdPairs(UastIds2Bag): + """ + Converts a UAST to a list of pairs. Pair is identifier and role, where role is Node role + where identifier was found. 
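
In the biased walk above, the back-step decision is a two-outcome alias sample: weight 1/p for returning to the previous node versus 1/q per child for moving on. With the defaults used by `UastRandomWalk2Bag` (p=0.79, q=0.82) and a node with three children, the back-step probability works out as:

```python
p_explore_neighborhood = 0.79
q_leave_neighborhood = 0.82
n_children = 3

threshold = 1 / p_explore_neighborhood
threshold /= threshold + n_children / q_leave_neighborhood
print(round(threshold, 3))  # ~0.257 chance of stepping back to the previous node
```
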
+ + __call__ is overridden here and returns list instead of bag-of-words (dist). + """ + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to token key. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + + """ + super().__init__(token2index=token2index, token_parser=token_parser) + self.exclude_roles = { + bblfsh_roles.EXPRESSION, + bblfsh_roles.IDENTIFIER, + bblfsh_roles.LEFT, + bblfsh_roles.QUALIFIED, + bblfsh_roles.BINARY, + bblfsh_roles.ASSIGNMENT, + } + + def __call__(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str]]: + """ + Converts a UAST to a list of identifier, role pairs. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: a list of identifier, role pairs. + """ + yield from self._process_uast(uast, []) + + def _process_uast(self, uast: bblfsh.Node, ancestors): + stack = [(uast, [])] + while stack: + node, ancestors = stack.pop() + + if bblfsh_roles.IDENTIFIER in node.roles and node.token: + roles = set(node.roles) + indx = -1 + # We skip all Nodes with roles from `self.exclude_roles` set. + # We skip any Node with OPERATOR role. + # For them we take first parent Node from stack with another Role set. + while not (roles - self.exclude_roles and bblfsh_roles.OPERATOR not in roles): + roles = set(ancestors[indx].roles) + indx -= 1 + for sub in self._token_parser.process_token(node.token): + try: + yield (self._token2index[sub], self.merge_roles(roles)) + except KeyError: + continue + ancestors = list(ancestors) + ancestors.append(node) + stack.extend([(child, ancestors) for child in node.children]) + + @staticmethod + def merge_roles(roles: Iterable[int]): + return " | ".join(bblfsh.role_name(r) for r in sorted(roles)) diff --git a/sourced/ml/core/extractors/__init__.py b/sourced/ml/core/extractors/__init__.py new file mode 100644 index 0000000..2d6f137 --- /dev/null +++ b/sourced/ml/core/extractors/__init__.py @@ -0,0 +1,12 @@ +# flake8: noqa +from sourced.ml.core.extractors.helpers import __extractors__, get_names_from_kwargs, \ + register_extractor, filter_kwargs, create_extractors_from_args +from sourced.ml.core.extractors.bags_extractor import Extractor, BagsExtractor, RoleIdsExtractor +from sourced.ml.core.extractors.identifiers import IdentifiersBagExtractor +from sourced.ml.core.extractors.literals import LiteralsBagExtractor +from sourced.ml.core.extractors.uast_random_walk import UastRandomWalkBagExtractor +from sourced.ml.core.extractors.uast_seq import UastSeqBagExtractor +from sourced.ml.core.extractors.children import ChildrenBagExtractor +from sourced.ml.core.extractors.graphlets import GraphletBagExtractor +from sourced.ml.core.extractors.identifier_distance import IdentifierDistance +from sourced.ml.core.extractors.id_sequence import IdSequenceExtractor diff --git a/sourced/ml/core/extractors/bags_extractor.py b/sourced/ml/core/extractors/bags_extractor.py new file mode 100644 index 0000000..cf6637f --- /dev/null +++ b/sourced/ml/core/extractors/bags_extractor.py @@ -0,0 +1,95 @@ +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_to_role_id_pairs import Uast2RoleIdPairs +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class Extractor(PickleableLogger): + """ + Converts a single UAST via `algorithm` to anything you need. 
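
`merge_roles` in `Uast2RoleIdPairs` above renders a role set as a sorted, pipe-separated string. A sketch of that formatting with a tiny hypothetical id-to-name table in place of `bblfsh.role_name`:

```python
ROLE_NAMES = {1: "IDENTIFIER", 18: "CALL", 40: "ARGUMENT"}  # hypothetical role IDs


def merge_roles(roles):
    # roles are sorted by their numeric ID, as in Uast2RoleIdPairs.merge_roles
    return " | ".join(ROLE_NAMES[r] for r in sorted(roles))


print(merge_roles({40, 18}))  # CALL | ARGUMENT
```
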
+ It is a wrapper to use in `Uast2Features` Transformer in a pipeline. + """ + NAME = None # feature scheme name, should be overridden in the derived class. + ALGORITHM = None # algorithm class to extract from UAST + OPTS = {} # cmdline args which are passed into __init__() + + def _get_log_name(self): + return type(self).__name__ + + @classmethod + def get_kwargs_fromcmdline(cls, args): + prefix = cls.NAME + "_" + result = {} + for k, v in args.__dict__.items(): + if k.startswith(prefix): + result[k[len(prefix):]] = v + return result + + def extract(self, uast: bblfsh.Node): + yield from self.ALGORITHM(uast) + + +class BagsExtractor(Extractor): + """ + Converts a single UAST into the weighted set (dictionary), where elements are strings + and the values are floats. The derived classes must implement uast_to_bag(). + """ + DEFAULT_DOCFREQ_THRESHOLD = 5 + NAMESPACE = None # the beginning of each element in the bag + OPTS = {"weight": 1} # cmdline args which are passed into __init__() + + def __init__(self, docfreq_threshold=None, weight=None, **kwargs): + """ + :param docfreq_threshold: The minimum number of occurrences of an element to be included \ + into the bag + :param weight: TF-IDF will be multiplied by this weight to change importance of specific \ + bag extractor + :param kwargs: Parameters for parent constructor. + """ + super().__init__(**kwargs) + if docfreq_threshold is None: + docfreq_threshold = self.DEFAULT_DOCFREQ_THRESHOLD + self.docfreq_threshold = docfreq_threshold + self.docfreq = {} + self._ndocs = 0 + if weight is None: + self.weight = 1 + else: + self.weight = weight + + @property + def docfreq_threhold(self): + return self._docfreq_threshold + + @docfreq_threhold.setter + def docfreq_threshold(self, value): + if not isinstance(value, int): + raise TypeError("docfreq_threshold must be an integer, got %s" % type(value)) + if value < 1: + raise ValueError("docfreq_threshold must be >= 1, got %d" % value) + self._docfreq_threshold = value + + @property + def ndocs(self): + return self._ndocs + + @ndocs.setter + def ndocs(self, value): + if not isinstance(value, int): + raise TypeError("ndocs must be an integer, got %s" % type(value)) + if value < 1: + raise ValueError("ndocs must be >= 1, got %d" % value) + self._ndocs = value + + def extract(self, uast): + for key, val in self.uast_to_bag(uast).items(): + yield self.NAMESPACE + key, val * self.weight + + def uast_to_bag(self, uast): + raise NotImplementedError + + +class RoleIdsExtractor(Extractor): + NAME = "roleids" + ALGORITHM = Uast2RoleIdPairs(token_parser=NoopTokenParser()) diff --git a/sourced/ml/core/extractors/children.py b/sourced/ml/core/extractors/children.py new file mode 100644 index 0000000..6f2b7ff --- /dev/null +++ b/sourced/ml/core/extractors/children.py @@ -0,0 +1,49 @@ +import logging +from typing import Iterable, Tuple + +from sourced.ml.core.algorithms.uast_inttypes_to_nodes import Uast2QuantizedChildren +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import (filter_kwargs, get_names_from_kwargs, + register_extractor) + + +@register_extractor +class ChildrenBagExtractor(BagsExtractor): + """ + Converts a UAST to the bag of pairs (internal type, quantized number of children). + """ + NAME = "children" + NAMESPACE = "c." 
+ OPTS = dict(get_names_from_kwargs(Uast2QuantizedChildren.__init__)) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, Uast2QuantizedChildren.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast_to_bag = Uast2QuantizedChildren(**uast2bag_kwargs) + + @property + def npartitions(self): + return self.uast_to_bag.npartitions + + @property + def levels(self): + return self.uast_to_bag.levels + + def extract(self, uast): + if not self.uast_to_bag.levels: + # bypass NAMESPACE + gen = self.uast_to_bag(uast).items() + else: + gen = super().extract(uast) + for key, val in gen: + yield key, val + + def quantize(self, frequencies: Iterable[Tuple[str, Iterable[Tuple[int, int]]]]): + self.uast_to_bag.quantize(frequencies) + if self._log.isEnabledFor(logging.DEBUG): + for k, v in self.uast_to_bag.levels.items(): + self._log.debug("%s\n%s", k, v) diff --git a/sourced/ml/core/extractors/graphlets.py b/sourced/ml/core/extractors/graphlets.py new file mode 100644 index 0000000..bb9cf6b --- /dev/null +++ b/sourced/ml/core/extractors/graphlets.py @@ -0,0 +1,25 @@ +from sourced.ml.core.algorithms.uast_inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import \ + (filter_kwargs, get_names_from_kwargs, register_extractor) + + +@register_extractor +class GraphletBagExtractor(BagsExtractor): + NAME = "graphlet" + NAMESPACE = "g." + OPTS = dict(get_names_from_kwargs(Uast2GraphletBag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, Uast2GraphletBag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + uast2bag_kwargs = filter_kwargs(kwargs, Uast2GraphletBag.__init__) + self.uast2bag = Uast2GraphletBag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/extractors/helpers.py b/sourced/ml/core/extractors/helpers.py new file mode 100644 index 0000000..885c7d5 --- /dev/null +++ b/sourced/ml/core/extractors/helpers.py @@ -0,0 +1,32 @@ +import argparse +import inspect +from typing import List + +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + +__extractors__ = {} + + +def register_extractor(cls): + if not issubclass(cls, BagsExtractor): + raise TypeError("%s is not an instance of %s" % (cls.__name__, BagsExtractor.__name__)) + __extractors__[cls.NAME] = cls + return cls + + +def get_names_from_kwargs(f): + for k, v in inspect.signature(f).parameters.items(): + if v.default != inspect.Parameter.empty and isinstance( + v.default, (str, int, float, tuple)): + yield k.replace("_", "-"), v.default + + +def filter_kwargs(kwargs, func): + func_param = inspect.signature(func).parameters.keys() + return {k: v for k, v in kwargs.items() if k in func_param} + + +def create_extractors_from_args(args: argparse.Namespace) -> List[BagsExtractor]: + return [__extractors__[s](args.min_docfreq, log_level=args.log_level, + **__extractors__[s].get_kwargs_fromcmdline(args)) + for s in args.feature] diff --git a/sourced/ml/core/extractors/id_sequence.py b/sourced/ml/core/extractors/id_sequence.py new file mode 100644 index 0000000..b793d87 --- /dev/null +++ 
b/sourced/ml/core/extractors/id_sequence.py @@ -0,0 +1,32 @@ +from typing import Iterable + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_to_id_sequence import Uast2IdSequence +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + + +class IdSequenceExtractor(BagsExtractor): + """ + Extractor wrapper for Uast2RoleIdPairs algorithm. + Note that this is unusual BagsExtractor since it returns iterable instead of bag. + + The class did not wrap with @register_extractor because it does not produce bags as others do. + So nobody outside code will see it or use it directly. + For the same reason we a free to override NAMESPACE, NAME, OPTS fields with any value we want. + + TODO(zurk): Split BagsExtractor into two clases: Extractor and BagsExtractor(Extractor), + re-inherit this class from Extractor, delete explanations from docstring. + """ + NAMESPACE = "" + NAME = "id sequence" + OPTS = {} + + def __init__(self, split_stem=False, **kwargs): + super().__init__(**kwargs) + self.uast2id_sequence = Uast2IdSequence( + None, NoopTokenParser() if not split_stem else None) + + def extract(self, uast: bblfsh.Node) -> Iterable[str]: + yield self.uast2id_sequence(uast), None diff --git a/sourced/ml/core/extractors/identifier_distance.py b/sourced/ml/core/extractors/identifier_distance.py new file mode 100644 index 0000000..8168913 --- /dev/null +++ b/sourced/ml/core/extractors/identifier_distance.py @@ -0,0 +1,49 @@ +from typing import Iterable, Tuple + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_id_distance import (Uast2IdDistance, Uast2IdLineDistance, + Uast2IdTreeDistance) +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + + +class IdentifierDistance(BagsExtractor): + """ + Extractor wrapper for Uast2IdTreeDistance and Uast2IdLineDistance algorithm. + Note that this is an unusual BagsExtractor since it returns iterable instead of bag. + + The class did not wrap with @register_extractor because it does not produce bags as others do. + So nobody outside code will see it or use it directly. + For the same reason we a free to override NAMESPACE, NAME, OPTS fields with any value we want. + + TODO(zurk): Split BagsExtractor into two clases: Extractor and BagsExtractor(Extractor), + re-inherit this class from Extractor, delete explanations from docstring. 
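
Several extractors split their keyword arguments between `BagsExtractor.__init__` and the wrapped algorithm using `filter_kwargs` from `helpers.py`: only names present in the target signature are kept. A small illustration with hypothetical values and a stand-in constructor:

```python
import inspect


def filter_kwargs(kwargs, func):
    func_param = inspect.signature(func).parameters.keys()
    return {k: v for k, v in kwargs.items() if k in func_param}


def uast2bag_init(self, npartitions=20):  # stand-in for Uast2QuantizedChildren.__init__
    pass


print(filter_kwargs({"npartitions": 10, "weight": 2.0}, uast2bag_init))
# {'npartitions': 10} - "weight" stays behind for BagsExtractor.__init__
```
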
+ """ + NAMESPACE = "" + NAME = "Identifier distance" + OPTS = {} + DEFAULT_MAX_DISTANCE = Uast2IdDistance.DEFAULT_MAX_DISTANCE + + class DistanceType: + Tree = "tree" + Line = "line" + All = {Tree, Line} + + @staticmethod + def resolve(type): + if type == IdentifierDistance.DistanceType.Line: + return Uast2IdLineDistance + if type == IdentifierDistance.DistanceType.Tree: + return Uast2IdTreeDistance + raise ValueError("Unknown distance type: %s" % type) + + def __init__(self, split_stem=False, type="tree", max_distance=DEFAULT_MAX_DISTANCE, **kwargs): + super().__init__(**kwargs) + Uast2IdDistance = self.DistanceType.resolve(type) + self.uast2id_distance = Uast2IdDistance( + token_parser=NoopTokenParser() if not split_stem else None, + max_distance=max_distance) + + def extract(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str, int]]: + yield from self.uast2id_distance(uast) diff --git a/sourced/ml/core/extractors/identifiers.py b/sourced/ml/core/extractors/identifiers.py new file mode 100644 index 0000000..375d594 --- /dev/null +++ b/sourced/ml/core/extractors/identifiers.py @@ -0,0 +1,19 @@ +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import register_extractor + + +@register_extractor +class IdentifiersBagExtractor(BagsExtractor): + NAME = "id" + NAMESPACE = "i." + OPTS = {"split-stem": True} + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, split_stem=True, **kwargs): + super().__init__(docfreq_threshold, **kwargs) + self.id2bag = UastIds2Bag(None, NoopTokenParser() if not split_stem else None) + + def uast_to_bag(self, uast): + return self.id2bag(uast) diff --git a/sourced/ml/core/extractors/literals.py b/sourced/ml/core/extractors/literals.py new file mode 100644 index 0000000..6430d9e --- /dev/null +++ b/sourced/ml/core/extractors/literals.py @@ -0,0 +1,67 @@ +import codecs +from collections import defaultdict +import os + +from sourced.ml.core.algorithms.uast_ids_to_bag import uast2sequence, UastIds2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import register_extractor +from sourced.ml.core.utils import bblfsh_roles + + +class HashedTokenParser: + def process_token(self, token): + yield codecs.encode((hash(token) & 0xffffffffffffffff).to_bytes(8, "little"), + "hex_codec").decode() + + +class Literals2Bag(UastIds2Bag): + """ + Converts a UAST to a bag-of-literals. + """ + + XPATH = "//*[@roleLiteral]" + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + """ + token_parser = HashedTokenParser() if token_parser is None else token_parser + super().__init__(token2index, token_parser) + + def __call__(self, uast): + """ + HOTFIX for https://github.com/bblfsh/client-python/issues/92 + Converts a UAST to a weighed bag-of-literals. The weights are literals frequencies. + The tokens are preprocessed by _token_parser. + Overwrite __call__ to avoid issues with `bblfsh.filter`. + + :param uast: The UAST root node. 
+ :return: bag + """ + nodes = [node for node in uast2sequence(uast) if bblfsh_roles.LITERAL in node.roles] + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag + + +@register_extractor +class LiteralsBagExtractor(BagsExtractor): + NAME = "lit" + NAMESPACE = "l." + OPTS = BagsExtractor.OPTS.copy() + + def __init__(self, docfreq_threshold=None, **kwargs): + super().__init__(docfreq_threshold, **kwargs) + self.id2bag = Literals2Bag(None, HashedTokenParser()) + + def uast_to_bag(self, uast): + if os.getenv("PYTHONHASHSEED", "random") == "random": + raise RuntimeError("PYTHONHASHSEED must be set") + return self.id2bag(uast) diff --git a/sourced/ml/core/extractors/uast_random_walk.py b/sourced/ml/core/extractors/uast_random_walk.py new file mode 100644 index 0000000..b4db69e --- /dev/null +++ b/sourced/ml/core/extractors/uast_random_walk.py @@ -0,0 +1,23 @@ +from sourced.ml.core.algorithms.uast_struct_to_bag import UastRandomWalk2Bag +from sourced.ml.core.extractors.helpers import ( + BagsExtractor, filter_kwargs, get_names_from_kwargs, register_extractor) + + +@register_extractor +class UastRandomWalkBagExtractor(BagsExtractor): + NAME = "node2vec" + NAMESPACE = "r." + OPTS = dict(get_names_from_kwargs(UastRandomWalk2Bag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, UastRandomWalk2Bag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast2bag = UastRandomWalk2Bag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/extractors/uast_seq.py b/sourced/ml/core/extractors/uast_seq.py new file mode 100644 index 0000000..5f02138 --- /dev/null +++ b/sourced/ml/core/extractors/uast_seq.py @@ -0,0 +1,23 @@ +from sourced.ml.core.algorithms.uast_struct_to_bag import UastSeq2Bag +from sourced.ml.core.extractors.helpers import ( + BagsExtractor, filter_kwargs, get_names_from_kwargs, register_extractor) + + +@register_extractor +class UastSeqBagExtractor(BagsExtractor): + NAME = "uast2seq" + NAMESPACE = "s." 
+ OPTS = dict(get_names_from_kwargs(UastSeq2Bag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, UastSeq2Bag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast2bag = UastSeq2Bag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/modelforgecfg.py b/sourced/ml/core/modelforgecfg.py new file mode 100644 index 0000000..5148c1f --- /dev/null +++ b/sourced/ml/core/modelforgecfg.py @@ -0,0 +1,8 @@ +import os + + +VENDOR = "source{d}" +BACKEND = "gcs" +BACKEND_ARGS = "bucket=models.cdn.sourced.tech" +INDEX_REPO = "https://github.com/src-d/models" +CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "source{d}") diff --git a/sourced/ml/core/models/__init__.py b/sourced/ml/core/models/__init__.py new file mode 100644 index 0000000..5edb673 --- /dev/null +++ b/sourced/ml/core/models/__init__.py @@ -0,0 +1,12 @@ +# flake8: noqa +from sourced.ml.core.models.bow import BOW +from sourced.ml.core.models.coocc import Cooccurrences +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.ordered_df import OrderedDocumentFrequencies +from sourced.ml.core.models.id2vec import Id2Vec +from sourced.ml.core.models.tensorflow import TensorFlowModel +from sourced.ml.core.models.topics import Topics +from sourced.ml.core.models.quant import QuantizationLevels + +from sourced.ml.core.models.model_converters.merge_df import MergeDocFreq +from sourced.ml.core.models.model_converters.merge_bow import MergeBOW diff --git a/sourced/ml/core/models/bow.py b/sourced/ml/core/models/bow.py new file mode 100644 index 0000000..86f5447 --- /dev/null +++ b/sourced/ml/core/models/bow.py @@ -0,0 +1,131 @@ +import logging +from typing import Dict, Iterable, List + +from modelforge import assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, Model, \ + register_model, split_strings +from modelforge.progress_bar import progress_bar +from scipy import sparse + +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class BOW(Model): + """ + Weighted bag of words model. Every word is correspond to an index and its matrix column. + Bag is a word set from repository, file or anything else. + Word is source code identifier or its part. + This model depends on :class:`sourced.ml.models.DocumentFrequencies`. + """ + NAME = "bow" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains source code as weighted bag of words." + LICENSE = DEFAULT_LICENSE + + def construct(self, documents: List[str], tokens: List[str], matrix: sparse.spmatrix): + if matrix.shape[0] != len(documents): + raise ValueError("matrix shape mismatch, documents %d != %d" % ( + matrix.shape[0], len(documents))) + if matrix.shape[1] != len(tokens): + raise ValueError("matrix shape mismatch, tokens %d != %d" % ( + matrix.shape[1], len(tokens))) + self._documents = documents + self._matrix = matrix + self._tokens = tokens + return self + + def dump(self): + return "Shape: %s\n" \ + "First 10 documents: %s\n" \ + "First 10 tokens: %s" % \ + (self._matrix.shape, self._documents[:10], self.tokens[:10]) + + @property + def matrix(self) -> sparse.spmatrix: + """ + Returns the bags as a sparse matrix. Rows are documents and columns are tokens weight. 
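
The `BOW` model introduced above is documents by tokens: row i holds the weighted bag of document i. A minimal in-memory construction that satisfies the shape checks in `construct` (toy data, assuming `modelforge` and `scipy` are installed):

```python
from scipy import sparse

from sourced.ml.core.models import BOW

documents = ["repo1", "repo2"]
tokens = ["i.foo", "i.bar", "l.baz"]
matrix = sparse.csr_matrix([[1, 0, 2],
                            [0, 3, 0]])  # rows: documents, columns: tokens

bow = BOW().construct(documents, tokens, matrix)
print(len(bow))  # 2 documents
print(bow[0])    # ('repo1', indices of "i.foo" and "l.baz", their weights)
```
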
+ """ + return self._matrix + + @property + def documents(self): + """ + The list of documents in the model. + """ + return self._documents + + @property + def tokens(self): + """ + The list of tokens in the model. + """ + return self._tokens + + def __getitem__(self, item: int): + """ + Returns document name, word indices and weights for the given document index. + + :param item: Document index. + :return: (name, :class:`numpy.ndarray` with word indices, \ + :class:`numpy.ndarray` with weights) + """ + data = self._matrix[item] + return self._documents[item], data.indices, data.data + + def __iter__(self): + """ + Returns an iterator over the document indices. + """ + return iter(range(len(self))) + + def __len__(self): + """ + Returns the number of documents. + """ + return len(self._documents) + + def save(self, output: str, series: str, deps: Iterable = tuple(), + create_missing_dirs: bool = True): + if not deps: + try: + deps = [self.get_dep(DocumentFrequencies.NAME)] + except KeyError: + raise ValueError( + "You must specify DocumentFrequencies dependency to save BOW.") from None + super().save(output=output, series=series, deps=deps, + create_missing_dirs=create_missing_dirs) + + def convert_bow_to_vw(self, output: str): + log = logging.getLogger("bow2vw") + log.info("Writing %s", output) + with open(output, "w") as fout: + for index in progress_bar(self, log, expected_size=len(self)): + record = self[index] + fout.write(record[0].replace(":", "").replace(" ", "_") + " ") + pairs = [] + for t, v in zip(*record[1:]): + try: + word = self.tokens[t] + except (KeyError, IndexError): + log.warning("%d not found in the vocabulary", t) + continue + pairs.append("%s:%s" % (word, v)) + fout.write(" ".join(pairs)) + fout.write("\n") + + def documents_index(self) -> Dict[str, int]: + return {r: i for i, r in enumerate(self._documents)} + + def _generate_tree(self): + return {"documents": merge_strings(self._documents), + "matrix": disassemble_sparse_matrix(self._matrix), + "tokens": merge_strings(self.tokens)} + + def _load_tree_kwargs(self, tree: dict): + return {"documents": split_strings(tree["documents"]), + "matrix": assemble_sparse_matrix(tree["matrix"]), + "tokens": split_strings(tree["tokens"])} + + def _load_tree(self, tree: dict): + self.construct(**self._load_tree_kwargs(tree)) diff --git a/sourced/ml/core/models/coocc.py b/sourced/ml/core/models/coocc.py new file mode 100644 index 0000000..66f2dfc --- /dev/null +++ b/sourced/ml/core/models/coocc.py @@ -0,0 +1,62 @@ +from modelforge.model import ( + assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, Model, split_strings) +from modelforge.models import register_model + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Cooccurrences(Model): + """ + Co-occurrence matrix. + """ + NAME = "co-occurrences" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains the sparse co-occurrence matrix of source code identifiers." 
+ LICENSE = DEFAULT_LICENSE + + def construct(self, tokens, matrix): + self._tokens = tokens + self._matrix = matrix + return self + + def _load_tree(self, tree): + self.construct(tokens=split_strings(tree["tokens"]), + matrix=assemble_sparse_matrix(tree["matrix"])) + + def dump(self): + return """Number of words: %d +First 10 words: %s +Matrix: shape: %s non-zero: %d""" % ( + len(self.tokens), self.tokens[:10], self.matrix.shape, self.matrix.getnnz()) + + @property + def tokens(self): + """ + Returns the tokens in the order which corresponds to the matrix's rows and cols. + """ + return self._tokens + + @property + def matrix(self): + """ + Returns the sparse co-occurrence matrix. + """ + return self._matrix + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._tokens) + + def _generate_tree(self): + return {"tokens": merge_strings(self.tokens), + "matrix": disassemble_sparse_matrix(self.matrix)} + + def matrix_to_rdd(self, spark_context: "pyspark.SparkContext") -> "pyspark.RDD": + self._log.info("Convert coocc model to RDD...") + rdd_row = spark_context.parallelize(self._matrix.row) + rdd_col = spark_context.parallelize(self._matrix.col) + rdd_data = spark_context.parallelize(self._matrix.data) + return rdd_row.zip(rdd_col).zip(rdd_data) diff --git a/sourced/ml/core/models/df.py b/sourced/ml/core/models/df.py new file mode 100644 index 0000000..fad83d4 --- /dev/null +++ b/sourced/ml/core/models/df.py @@ -0,0 +1,169 @@ +from itertools import islice +from typing import Dict, Iterable, List, Union + +from modelforge import merge_strings, Model, register_model, split_strings +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class DocumentFrequencies(Model): + """ + Document frequencies - number of times a source code identifier appeared + in different repositories. Each repository counts only once. + """ + NAME = "docfreq" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains document frequencies of features extracted from code." + LICENSE = DEFAULT_LICENSE + + def construct(self, docs: int, tokfreqs: Union[Iterable[Dict[str, int]], Dict[str, int]]): + """ + Initializes this model. + :param docs: The number of documents. + :param tokfreqs: The dictionary of token -> frequency or the iterable collection of such + dictionaries. + :return: self + """ + if isinstance(tokfreqs, dict): + df = tokfreqs + else: + df = {} + for d in tokfreqs: + df.update(d) + self._docs = docs + self._df = df + return self + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ + + def _load_tree(self, tree: dict, tokens=None): + if tokens is None: + tokens = split_strings(tree["tokens"]) + freqs = tree["freqs"] + self._log.info("Building the docfreq dictionary...") + tokfreq = dict(zip(tokens, freqs)) + self.construct(docs=tree["docs"], tokfreqs=tokfreq) + + def _generate_tree(self): + tokens = self.tokens() + freqs = numpy.array([self._df[t] for t in tokens], dtype=numpy.float32) + return {"docs": self.docs, "tokens": merge_strings(tokens), "freqs": freqs} + + def dump(self): + return """Number of words: %d +Random 10 words: %s +Number of documents: %d""" % ( + len(self._df), dict(islice(self._df.items(), 10)), self.docs) + + @property + def docs(self) -> int: + """ + Returns the number of documents. + """ + return self._docs + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ + + def prune(self, threshold: int) -> "DocumentFrequencies": + """ + Removes tokens which occur less than `threshold` times. 
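
`DocumentFrequencies` above is essentially a token-to-document-count mapping plus the total document count; `prune` and `greatest` return new, smaller models. A quick usage sketch with toy frequencies, again assuming `modelforge` is available:

```python
from sourced.ml.core.models import DocumentFrequencies

df = DocumentFrequencies().construct(docs=10,
                                     tokfreqs={"foo": 8, "bar": 2, "baz": 1})
print(len(df), df.docs, df.get("bar"))  # 3 10 2
print(len(df.prune(2)))                 # 2 - "baz" falls below the threshold
print(len(df.greatest(1)))              # 1 - only the most frequent token survives
```
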
+ The operation happens *not* in-place - a new model is returned. + :param threshold: Minimum number of occurrences. + :return: The new model if the current one had to be changed, otherwise self. + """ + if threshold < 1: + raise ValueError("Invalid threshold: %d" % threshold) + if threshold == 1: + return self + self._log.info("Pruning to min %d occurrences", threshold) + pruned = type(self)() + pruned._docs = self.docs + pruned._df = {k: v for k, v in self._df.items() if v >= threshold} + self._log.info("Size: %d -> %d", len(self), len(pruned)) + pruned._meta = self.meta + return pruned + + def greatest(self, max_size: int) -> "DocumentFrequencies": + """ + Truncates the model to most frequent `max_size` tokens. + The operation happens *not* in-place - a new model is returned. + :param max_size: The maximum vocabulary size. + :return: The new model if the current one had to be changed, otherwise self. + """ + if max_size < 1: + raise ValueError("Invalid max_size: %d" % max_size) + if len(self) <= max_size: + return self + self._log.info("Pruning to max %d size", max_size) + pruned = type(self)() + pruned._docs = self.docs + freqs = numpy.fromiter(self._df.values(), dtype=numpy.int32, count=len(self)) + keys = numpy.array(list(self._df.keys()), dtype=object) + chosen = numpy.argpartition(freqs, len(freqs) - max_size)[len(freqs) - max_size:] + border_freq = freqs[chosen].min() + chosen = freqs >= border_freq + # argpartition can leave some of the elements with freq == border_freq outside + # so next step ensures that we include everything. + freqs = freqs[chosen] + keys = keys[chosen] + # we need to be deterministic at the cutoff frequency + # argpartition returns random samples every time + # so we treat words with the cutoff frequency separately + if max_size != freqs.shape[0]: + assert max_size < freqs.shape[0] + border_freq_indexes = freqs == border_freq + border_keys = keys[border_freq_indexes] + border_keys.sort() + border_keys = border_keys[:max_size - freqs.shape[0]] + df = dict(zip(keys[~border_freq_indexes], freqs[~border_freq_indexes])) + df.update({key: border_freq for key in border_keys}) + else: + df = dict(zip(keys, freqs)) + pruned._df = df + self._log.info("Size: %d -> %d", len(self), len(pruned)) + pruned._meta = self.meta + return pruned + + def __getitem__(self, item): + return self._df[item] + + def __iter__(self): + return iter(self._df.items()) + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._df) + + def get(self, item, default=None) -> Union[int, None]: + """ + Return the document frequency for a given token. + + :param item: The token to query. + :param default: Returned value in case the token is missing. + :return: int or `default` + """ + return self._df.get(item, default) + + def tokens(self) -> List[str]: + """ + Returns the list of tokens. + """ + return list(self._df) + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ diff --git a/sourced/ml/core/models/id2vec.py b/sourced/ml/core/models/id2vec.py new file mode 100644 index 0000000..782ae23 --- /dev/null +++ b/sourced/ml/core/models/id2vec.py @@ -0,0 +1,66 @@ +from modelforge import merge_strings, Model, register_model, split_strings + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Id2Vec(Model): + """ + id2vec model - source code identifier embeddings. + """ + NAME = "id2vec" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains information on source code as identifier embeddings." 
+ LICENSE = DEFAULT_LICENSE + + def construct(self, embeddings, tokens): + self._embeddings = embeddings + self._tokens = tokens + self._log.info("Building the token index...") + self._token2index = {w: i for i, w in enumerate(self._tokens)} + return self + + def _load_tree(self, tree): + self.construct(embeddings=tree["embeddings"].copy(), + tokens=split_strings(tree["tokens"])) + + def dump(self): + return """Shape: %s +First 10 words: %s""" % ( + self.embeddings.shape, self.tokens[:10]) + + @property + def embeddings(self): + """ + :class:`numpy.ndarray` with the embeddings of shape + (N tokens x embedding dims). + """ + return self._embeddings + + @property + def tokens(self): + """ + List with the processed source code identifiers. + """ + return self._tokens + + def items(self): + """ + Returns the tuples belonging to token -> index mapping. + """ + return self._token2index.items() + + def __getitem__(self, item): + """ + Returns the index of the specified processed source code identifier. + """ + return self._token2index[item] + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._tokens) + + def _generate_tree(self): + return {"embeddings": self.embeddings, "tokens": merge_strings(self.tokens)} diff --git a/sourced/ml/core/models/license.py b/sourced/ml/core/models/license.py new file mode 100644 index 0000000..9ebe479 --- /dev/null +++ b/sourced/ml/core/models/license.py @@ -0,0 +1,3 @@ +"""Default license used for the models.""" + +DEFAULT_LICENSE = "ODbL-1.0" diff --git a/sourced/ml/core/models/model_converters/__init__.py b/sourced/ml/core/models/model_converters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/models/model_converters/base.py b/sourced/ml/core/models/model_converters/base.py new file mode 100644 index 0000000..bcc0ee2 --- /dev/null +++ b/sourced/ml/core/models/model_converters/base.py @@ -0,0 +1,115 @@ +import logging +import multiprocessing +import os +from typing import List, Union + +from modelforge import Model +from modelforge.progress_bar import progress_bar + +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class Model2Base(PickleableLogger): + """ + Base class for model -> model conversions. + """ + MODEL_FROM_CLASS = None + MODEL_TO_CLASS = None + + def __init__(self, num_processes: int = 0, + log_level: int = logging.DEBUG, overwrite_existing: bool = True): + """ + Initializes a new instance of Model2Base class. + + :param num_processes: The number of processes to execute for conversion. + :param log_level: Logging verbosity level. + :param overwrite_existing: Rewrite existing models or skip them. + """ + super().__init__(log_level=log_level) + self.num_processes = multiprocessing.cpu_count() if num_processes == 0 else num_processes + self.overwrite_existing = overwrite_existing + + def convert(self, models_path: List[str], destdir: str) -> int: + """ + Performs the model -> model conversion. Runs the conversions in a pool of processes. + + :param models_path: List of Models path. + :param destdir: The directory where to store the models. The directory structure is \ + preserved. + :return: The number of converted files. 
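
`Id2Vec` above is a thin wrapper around an embedding matrix and a token index: `model[token]` returns the row number and `model.embeddings[row]` the vector. The lookup in miniature, with made-up numbers:

```python
import numpy

embeddings = numpy.array([[0.10, 0.20],
                          [0.30, 0.40]], dtype=numpy.float32)  # toy 2-D embeddings
tokens = ["foo", "bar"]
token2index = {w: i for i, w in enumerate(tokens)}  # what construct() builds

print(embeddings[token2index["bar"]])  # [0.3 0.4]
```
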
+ """ + files = list(models_path) + self._log.info("Found %d files", len(files)) + if not files: + return 0 + queue_in = multiprocessing.Manager().Queue() + queue_out = multiprocessing.Manager().Queue(1) + processes = [multiprocessing.Process(target=self._process_entry, + args=(i, destdir, queue_in, queue_out)) + for i in range(self.num_processes)] + for p in processes: + p.start() + for f in files: + queue_in.put(f) + for _ in processes: + queue_in.put(None) + failures = 0 + for _ in progress_bar(files, self._log, expected_size=len(files)): + filename, ok = queue_out.get() + if not ok: + failures += 1 + for p in processes: + p.join() + self._log.info("Finished, %d failed files", failures) + return len(files) - failures + + def convert_model(self, model: Model) -> Union[Model, None]: + """ + This must be implemented in the child classes. + + :param model: The model instance to convert. + :return: The converted model instance or None if it is not needed. + """ + raise NotImplementedError + + def finalize(self, index: int, destdir: str): + """ + Called for each worker in the end of the processing. + + :param index: Worker's index. + :param destdir: The directory where to store the models. + """ + pass + + def _process_entry(self, index, destdir, queue_in, queue_out): + while True: + filepath = queue_in.get() + if filepath is None: + break + try: + model_path = os.path.join(destdir, os.path.split(filepath)[1]) + if os.path.exists(model_path): + if self.overwrite_existing: + self._log.warning( + "Model %s already exists, but will be overwrite. If you want to " + "skip existing models use --disable-overwrite flag", model_path) + else: + self._log.warning("Model %s already exists, skipping.", model_path) + queue_out.put((filepath, True)) + continue + model_from = self.MODEL_FROM_CLASS(log_level=self._log.level).load(filepath) + model_to = self.convert_model(model_from) + if model_to is not None: + dirs = os.path.dirname(model_path) + if dirs: + os.makedirs(dirs, exist_ok=True) + model_to.save(model_path, deps=model_to.meta["dependencies"]) + except: # noqa + self._log.exception("%s failed", filepath) + queue_out.put((filepath, False)) + else: + queue_out.put((filepath, True)) + self.finalize(index, destdir) + + def _get_log_name(self): + return "%s2%s" % (self.MODEL_FROM_CLASS.NAME, self.MODEL_TO_CLASS.NAME) diff --git a/sourced/ml/core/models/model_converters/merge_bow.py b/sourced/ml/core/models/model_converters/merge_bow.py new file mode 100644 index 0000000..305ccb1 --- /dev/null +++ b/sourced/ml/core/models/model_converters/merge_bow.py @@ -0,0 +1,69 @@ +import os + +from scipy.sparse import vstack + +from sourced.ml.core import extractors +from sourced.ml.core.models.bow import BOW +from sourced.ml.core.models.model_converters.base import Model2Base + + +class MergeBOW(Model2Base): + """ + Merges several :class:`BOW` models together. 
+ """ + MODEL_FROM_CLASS = BOW + MODEL_TO_CLASS = BOW + + def __init__(self, features=None, *args, **kwargs): + super().__init__(num_processes=1, *args, **kwargs) + self.documents = None + self.tokens = None + self.matrix = None + self.deps = None + self.features_namespaces = None + if features: + self.features_namespaces = [ex.NAMESPACE for ex in extractors.__extractors__.values() + if ex.NAME in features] + + def convert_model(self, model: BOW) -> None: + if self.tokens is None: + self.tokens = model.tokens + self.documents = model.documents + self.matrix = [model.matrix.tocsr()] + self.deps = model._meta["dependencies"] + elif set(self.tokens) != set(model.tokens): + raise ValueError("Models don't share the same set of tokens !") + else: + self.documents += model.documents + self.matrix.append(model.matrix.tocsr()) + + def finalize(self, index: int, destdir: str): + self._log.info("Stacking matrices ...") + matrix = self.matrix.pop(0) + while self.matrix: + matrix = vstack([matrix, self.matrix.pop(0)]) + self._log.info("%s matrices to stack ...", len(self.matrix)) + self.matrix = matrix + self._log.info("Writing model ...") + if self.features_namespaces: + self._reduce_matrix() + BOW(log_level=self._log.level) \ + .construct(self.documents, self.tokens, self.matrix) \ + .save(output=self._save_path(index, destdir), series="id2vec", deps=self.deps) + + def _reduce_matrix(self): + reduced_tokens = [] + columns = [] + matrix = self.matrix.tocsc() + for i, token in enumerate(self.tokens): + if token.split(".")[0] in self.features_namespaces: + reduced_tokens.append(token) + columns.append(i) + self.tokens = reduced_tokens + self.matrix = matrix[:, columns] + + @staticmethod + def _save_path(index: int, destdir: str): + if destdir.endswith(".asdf"): + return destdir + return os.path.join(destdir, "bow_%d.asdf" % index) diff --git a/sourced/ml/core/models/model_converters/merge_df.py b/sourced/ml/core/models/model_converters/merge_df.py new file mode 100644 index 0000000..68f0a58 --- /dev/null +++ b/sourced/ml/core/models/model_converters/merge_df.py @@ -0,0 +1,42 @@ +from collections import defaultdict +import os + +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.model_converters.base import Model2Base +from sourced.ml.core.models.ordered_df import OrderedDocumentFrequencies + + +class MergeDocFreq(Model2Base): + """ + Merges several :class:`DocumentFrequencies` models together. 
+ """ + MODEL_FROM_CLASS = DocumentFrequencies + MODEL_TO_CLASS = DocumentFrequencies + + def __init__(self, min_docfreq: int, vocabulary_size: int, ordered: bool = False, + *args, **kwargs): + super().__init__(num_processes=1, *args, **kwargs) + self.ordered = ordered + self.min_docfreq = min_docfreq + self.vocabulary_size = vocabulary_size + self._df = defaultdict(int) + self._docs = 0 + + def convert_model(self, model: DocumentFrequencies) -> None: + for word, freq in model: + self._df[word] += freq + self._docs += model.docs + + def finalize(self, index: int, destdir: str): + df_model = OrderedDocumentFrequencies if self.ordered else DocumentFrequencies + df_model(log_level=self._log.level) \ + .construct(self._docs, self._df) \ + .prune(self.min_docfreq) \ + .greatest(self.vocabulary_size) \ + .save(output=self._save_path(index, destdir), series="id2vec") + + @staticmethod + def _save_path(index: int, destdir: str): + if destdir.endswith(".asdf"): + return destdir + return os.path.join(destdir, "docfreq_%d.asdf" % index) diff --git a/sourced/ml/core/models/ordered_df.py b/sourced/ml/core/models/ordered_df.py new file mode 100644 index 0000000..ecfd6ed --- /dev/null +++ b/sourced/ml/core/models/ordered_df.py @@ -0,0 +1,60 @@ +from typing import Dict, Iterable, List + +from modelforge import merge_strings, register_model, split_strings +import numpy + +from sourced.ml.core.models import DocumentFrequencies + + +@register_model +class OrderedDocumentFrequencies(DocumentFrequencies): + """ + Compatible with the original DocumentFrequencies. This model maintains the determinitic + sequence of the tokens. + """ + # NAME is the same + + def construct(self, docs: int, tokfreqs: Iterable[Dict[str, int]]): + super().construct(docs, tokfreqs) + self._log.info("Ordering the keys...") + keys = sorted(self._df) + self._order = {k: i for i, k in enumerate(keys)} + return self + + @property + def order(self) -> Dict[str, int]: + return self._order + + def tokens(self) -> List[str]: + arr = [None for _ in range(len(self))] + for k, v in self.order.items(): + arr[v] = k + return arr + + def _load_tree(self, tree): + tokens = split_strings(tree["tokens"]) + super()._load_tree(tree, tokens) + self._log.info("Mapping the keys order...") + self._order = {k: i for i, k in enumerate(tokens)} + + def _generate_tree(self): + tokens = [None] * len(self) + freqs = numpy.zeros(len(self), dtype=numpy.float32) + for k, i in self._order.items(): + tokens[i] = k + freqs[i] = self._df[k] + return {"docs": self.docs, "tokens": merge_strings(tokens), "freqs": freqs} + + def prune(self, threshold: int) -> "OrderedDocumentFrequencies": + pruned = super().prune(threshold) + if pruned is not self: + self._log.info("Recovering the order...") + pruned._order = {k: i for i, k in enumerate(sorted(pruned._df))} + return pruned + + def greatest(self, max_size: int) -> "OrderedDocumentFrequencies": + pruned = super().greatest(max_size) + if pruned is not self: + self._log.info("Recovering the order...") + pruned._order = {k: i for i, k in enumerate(sorted(pruned._df))} + return pruned diff --git a/sourced/ml/core/models/quant.py b/sourced/ml/core/models/quant.py new file mode 100644 index 0000000..f56e253 --- /dev/null +++ b/sourced/ml/core/models/quant.py @@ -0,0 +1,62 @@ +from typing import Dict + +from modelforge import merge_strings, Model, register_model, split_strings +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class QuantizationLevels(Model): + """ + This model contains 
quantization levels for multiple schemes (feature types). + Every feature "class" (type, possible distinct value) corresponds to the numpy array + with integer level borders. The size of each numpy array is (the number of levels + 1). + """ + NAME = "quant" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains quantization levels for multiple schemes (feature types)." + LICENSE = DEFAULT_LICENSE + + def construct(self, levels: Dict[str, Dict[str, numpy.ndarray]]): + self._levels = levels + return self + + @property + def levels(self) -> Dict[str, Dict[str, numpy.ndarray]]: + return self._levels + + def __len__(self): + return len(self.levels) + + def _load_tree(self, tree): + self._levels = {} + for key, vals in tree["schemes"].items(): + classes = split_strings(vals["classes"]) + levels = vals["levels"] + self.levels[key] = dict(zip(classes, numpy.split(levels, len(classes)))) + + def _generate_tree(self): + tree = {"schemes": {}} + for key, vals in self.levels.items(): + tree["schemes"][key] = scheme = {} + npartitions = len(next(iter(vals.values()))) + classes = [None for _ in range(len(vals))] + scheme["levels"] = levels = numpy.zeros(len(vals) * npartitions, dtype=numpy.int32) + for i, pair in enumerate(vals.items()): + classes[i], levels[i * npartitions:(i + 1) * npartitions] = pair + scheme["classes"] = merge_strings(classes) + return tree + + def dump(self): + return """Schemes: %s""" % ( + sorted((v[0], "%d@%d" % (len(v[1]), len(next(iter(v[1].values()))) - 1)) + for v in self.levels.items())) + + def apply_quantization(self, extractors): + for extractor in extractors: + try: + extractor.quantize + except AttributeError: + continue + extractor.uast_to_bag.levels = self._levels[extractor.NAME] diff --git a/sourced/ml/core/models/tensorflow.py b/sourced/ml/core/models/tensorflow.py new file mode 100644 index 0000000..a45f97b --- /dev/null +++ b/sourced/ml/core/models/tensorflow.py @@ -0,0 +1,50 @@ +from typing import List + +from modelforge import Model, register_model +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class TensorFlowModel(Model): + """ + TensorFlow Protobuf model exported in the Modelforge format with GraphDef inside. + """ + NAME = "tensorflow-model" + VENDOR = "source{d}" + DESCRIPTION = "TensorFlow Protobuf model that contains a GraphDef instance." + LICENSE = DEFAULT_LICENSE + + def construct(self, graphdef: "tensorflow.GraphDef" = None, # noqa: F821 + session: "tensorflow.Session" = None, # noqa: F821 + outputs: List[str] = None): + if graphdef is None: + assert session is not None + assert outputs is not None + graphdef = session.graph_def + from tensorflow.python.framework import graph_util + for node in graphdef.node: + node.device = "" + graphdef = graph_util.convert_variables_to_constants( + session, graphdef, outputs) + self._graphdef = graphdef + return self + + @property + def graphdef(self): + """ + Returns the wrapped TensorFlow GraphDef. 
+ """ + return self._graphdef + + def _generate_tree(self) -> dict: + return {"graphdef": numpy.frombuffer(self._graphdef.SerializeToString(), + dtype=numpy.uint8)} + + def _load_tree(self, tree: dict): + from tensorflow.core.framework import graph_pb2 + + graphdef = graph_pb2.GraphDef() + graphdef.ParseFromString(tree["graphdef"].data) + self.construct(graphdef=graphdef) diff --git a/sourced/ml/core/models/topics.py b/sourced/ml/core/models/topics.py new file mode 100644 index 0000000..95cf873 --- /dev/null +++ b/sourced/ml/core/models/topics.py @@ -0,0 +1,86 @@ +from typing import Union + +from modelforge import assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, \ + Model, register_model, split_strings + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Topics(Model): + NAME = "topics" + VENDOR = "source{d}" + DESCRIPTION = "Model that is used to identify topics of source code repositories." + LICENSE = DEFAULT_LICENSE + + @property + def tokens(self): + return self._tokens + + @property + def topics(self): + """ + May be None if no topics are labeled. + """ + return self._topics + + @property + def matrix(self): + """ + Rows: tokens + Columns: topics + """ + return self._matrix + + def construct(self, tokens: list, topics: Union[list, None], matrix): + if len(tokens) != matrix.shape[1]: + raise ValueError("Tokens and matrix do not match.") + self._tokens = tokens + self._topics = topics + self._matrix = matrix + return self + + def _load_tree(self, tree: dict) -> None: + self.construct(split_strings(tree["tokens"]), + split_strings(tree["topics"]) if tree["topics"] else None, + assemble_sparse_matrix(tree["matrix"])) + + def dump(self) -> str: + res = "%d topics, %d tokens\nFirst 10 tokens: %s\nTopics: " % ( + self.matrix.shape + (self.tokens[:10],)) + if self.topics is not None: + res += "labeled, first 10: %s\n" % self.topics[:10] + else: + res += "unlabeled\n" + nnz = self.matrix.getnnz() + res += "non-zero elements: %d (%f)" % ( + nnz, nnz / (self.matrix.shape[0] * self.matrix.shape[1])) + return res + + def _generate_tree(self): + return {"tokens": merge_strings(self.tokens), + "topics": merge_strings(self.topics) if self.topics is not None else False, + "matrix": disassemble_sparse_matrix(self.matrix)} + + def __len__(self): + """ + Returns the number of topics. + """ + return self.matrix.shape[0] + + def __getitem__(self, item): + """ + Returns the keywords sorted by significance from topic index. 
+ """ + row = self.matrix[item] + nnz = row.nonzero()[1] + pairs = [(-row[0, i], i) for i in nnz] + pairs.sort() + return [(self.tokens[pair[1]], -pair[0]) for pair in pairs] + + def label_topics(self, labels): + if len(labels) != len(self): + raise ValueError("Sizes do not match: %d != %d" % (len(labels), len(self))) + if not isinstance(labels[0], str): + raise TypeError("Labels must be strings") + self._topics = list(labels) diff --git a/sourced/ml/core/tests/.gitignore b/sourced/ml/core/tests/.gitignore new file mode 100644 index 0000000..559a840 --- /dev/null +++ b/sourced/ml/core/tests/.gitignore @@ -0,0 +1 @@ +swivel/shard-000-000.pb \ No newline at end of file diff --git a/sourced/ml/core/tests/__init__.py b/sourced/ml/core/tests/__init__.py new file mode 100644 index 0000000..1f8dc17 --- /dev/null +++ b/sourced/ml/core/tests/__init__.py @@ -0,0 +1,24 @@ +import sys + +from modelforge import slogging + + +utmain = sys.modules["__main__"] +if utmain.__package__ == "unittest" and utmain.__spec__ is None: + from collections import namedtuple + ModuleSpec = namedtuple("ModuleSpec", ["name"]) + utmain.__spec__ = ModuleSpec("unittest.__main__") + del ModuleSpec +del utmain + + +def has_tensorflow(): + try: + import tensorflow # noqa + return True + except ImportError: + return False + + +def setup(): + slogging.setup("INFO", False) diff --git a/sourced/ml/core/tests/asdf/bow.asdf b/sourced/ml/core/tests/asdf/bow.asdf new file mode 100644 index 0000000..26b8ea0 Binary files /dev/null and b/sourced/ml/core/tests/asdf/bow.asdf differ diff --git a/sourced/ml/core/tests/asdf/coocc.asdf b/sourced/ml/core/tests/asdf/coocc.asdf new file mode 100644 index 0000000..9498b99 Binary files /dev/null and b/sourced/ml/core/tests/asdf/coocc.asdf differ diff --git a/sourced/ml/core/tests/asdf/coocc_df.asdf b/sourced/ml/core/tests/asdf/coocc_df.asdf new file mode 100644 index 0000000..b40f5d7 Binary files /dev/null and b/sourced/ml/core/tests/asdf/coocc_df.asdf differ diff --git a/sourced/ml/core/tests/asdf/docfreq_1000.asdf b/sourced/ml/core/tests/asdf/docfreq_1000.asdf new file mode 100644 index 0000000..2fa308d Binary files /dev/null and b/sourced/ml/core/tests/asdf/docfreq_1000.asdf differ diff --git a/sourced/ml/core/tests/asdf/id2vec_1000.asdf b/sourced/ml/core/tests/asdf/id2vec_1000.asdf new file mode 100644 index 0000000..d410d31 Binary files /dev/null and b/sourced/ml/core/tests/asdf/id2vec_1000.asdf differ diff --git a/sourced/ml/core/tests/asdf/quant.asdf b/sourced/ml/core/tests/asdf/quant.asdf new file mode 100644 index 0000000..793c4ba Binary files /dev/null and b/sourced/ml/core/tests/asdf/quant.asdf differ diff --git a/sourced/ml/core/tests/asdf/topics.asdf b/sourced/ml/core/tests/asdf/topics.asdf new file mode 100644 index 0000000..50b89dd Binary files /dev/null and b/sourced/ml/core/tests/asdf/topics.asdf differ diff --git a/sourced/ml/core/tests/asdf/uast.asdf b/sourced/ml/core/tests/asdf/uast.asdf new file mode 100644 index 0000000..8ca7458 Binary files /dev/null and b/sourced/ml/core/tests/asdf/uast.asdf differ diff --git a/sourced/ml/core/tests/asdf/voccoocc.asdf b/sourced/ml/core/tests/asdf/voccoocc.asdf new file mode 100644 index 0000000..835ab91 Binary files /dev/null and b/sourced/ml/core/tests/asdf/voccoocc.asdf differ diff --git a/sourced/ml/core/tests/identifiers.csv.tar.gz b/sourced/ml/core/tests/identifiers.csv.tar.gz new file mode 100644 index 0000000..4fac851 Binary files /dev/null and b/sourced/ml/core/tests/identifiers.csv.tar.gz differ diff --git 
a/sourced/ml/core/tests/models.py b/sourced/ml/core/tests/models.py new file mode 100644 index 0000000..6eeb379 --- /dev/null +++ b/sourced/ml/core/tests/models.py @@ -0,0 +1,25 @@ +from os.path import dirname, join + +_root = dirname(__file__) +_models_path = join(_root, "asdf") + +ID2VEC = join(_models_path, "id2vec_1000.asdf") +DOCFREQ = join(_models_path, "docfreq_1000.asdf") +QUANTLEVELS = join(_models_path, "quant.asdf") +BOW = join(_models_path, "bow.asdf") +COOCC = join(_models_path, "coocc.asdf") +COOCC_DF = join(_models_path, "coocc_df.asdf") +UAST = join(_models_path, "uast.asdf") +TOPICS = join(_models_path, "topics.asdf") + +DATA_DIR_SOURCE = join(_root, "source") +SOURCE_FILENAME = "example" +SOURCE = join(DATA_DIR_SOURCE, "%s.asdf" % SOURCE_FILENAME) +SOURCE_PY = join(DATA_DIR_SOURCE, "%s.py" % SOURCE_FILENAME) + +TOPICS_SRC = "topics_readable.txt" +PARQUET_DIR = join(_root, "parquet") +SIVA_DIR = join(_root, "siva") +IDENTIFIERS = join(_root, "identifiers.csv.tar.gz") + +MODER_FUNC = join(DATA_DIR_SOURCE, "example_functions.py") diff --git a/sourced/ml/core/tests/models/test_tensorflow.py b/sourced/ml/core/tests/models/test_tensorflow.py new file mode 100644 index 0000000..f33d441 --- /dev/null +++ b/sourced/ml/core/tests/models/test_tensorflow.py @@ -0,0 +1,32 @@ +import io +import unittest + +from sourced.ml.core.models.tensorflow import TensorFlowModel +from sourced.ml.core.tests import has_tensorflow + + +class TensorFlowModelTests(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_serialize(self): + import tensorflow as tf + a = tf.constant([[1, 0], [0, 1]]) + b = tf.constant([[0, 1], [1, 0]]) + c = tf.matmul(a, b) + gd = tf.get_default_graph().as_graph_def() + buffer = io.BytesIO() + TensorFlowModel().construct(graphdef=gd).save(buffer, series="tensorflow-model") + buffer.seek(0) + model = TensorFlowModel().load(buffer) + self.assertEqual(gd.node, model.graphdef.node) + + buffer = io.BytesIO() + with tf.Session() as session: + TensorFlowModel().construct(session=session, outputs=[c.name[:-2]]).save( + buffer, series="tensorflow-model") + buffer.seek(0) + model = TensorFlowModel().load(buffer) + self.assertEqual(gd.node, model.graphdef.node) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/source/__init__.py b/sourced/ml/core/tests/source/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/tests/source/example.py b/sourced/ml/core/tests/source/example.py new file mode 100644 index 0000000..11a01bc --- /dev/null +++ b/sourced/ml/core/tests/source/example.py @@ -0,0 +1,16 @@ +import sys + +from modelforge.logs import setup_logging + + +utmain = sys.modules["__main__"] +if utmain.__package__ == "unittest" and utmain.__spec__ is None: + from collections import namedtuple + ModuleSpec = namedtuple("ModuleSpec", ["name"]) + utmain.__spec__ = ModuleSpec("unittest.__main__") + del ModuleSpec +del utmain + + +def setup(): + setup_logging("INFO") diff --git a/sourced/ml/core/tests/source/example_functions.py b/sourced/ml/core/tests/source/example_functions.py new file mode 100644 index 0000000..fd7d05a --- /dev/null +++ b/sourced/ml/core/tests/source/example_functions.py @@ -0,0 +1,16 @@ +class Foo: + def func_a(self): + # should be counted + pass + + +def func_b(): + # should be counted + pass + + +def func_c(): + # should be counted + def func_d(): + # should not be counted + pass diff --git a/sourced/ml/core/tests/swivel/col_sums.txt 
b/sourced/ml/core/tests/swivel/col_sums.txt new file mode 100644 index 0000000..44ea5c6 --- /dev/null +++ b/sourced/ml/core/tests/swivel/col_sums.txt @@ -0,0 +1,304 @@ +21 +58 +76 +59 +38 +92 +102 +102 +58 +26 +23 +44 +77 +68 +50 +49 +76 +25 +76 +49 +37 +44 +61 +75 +30 +90 +79 +102 +12 +3 +27 +74 +57 +62 +59 +58 +51 +100 +34 +75 +32 +113 +55 +19 +43 +65 +41 +49 +49 +39 +22 +22 +99 +37 +16 +29 +86 +49 +13 +14 +28 +44 +80 +50 +29 +20 +13 +154 +24 +76 +62 +70 +128 +27 +21 +39 +11 +45 +43 +124 +92 +80 +141 +88 +23 +24 +50 +52 +175 +43 +115 +12 +29 +16 +49 +48 +19 +95 +10 +63 +37 +102 +59 +21 +106 +76 +65 +50 +95 +59 +26 +29 +23 +21 +91 +78 +8 +78 +142 +50 +76 +112 +76 +61 +37 +133 +55 +93 +37 +19 +13 +16 +20 +32 +31 +31 +31 +64 +13 +58 +11 +21 +198 +77 +39 +50 +7 +59 +111 +12 +50 +102 +29 +141 +55 +58 +13 +39 +30 +22 +10 +27 +60 +40 +4 +58 +50 +76 +52 +74 +41 +59 +40 +74 +40 +40 +156 +73 +16 +32 +34 +31 +27 +93 +58 +31 +27 +19 +28 +64 +82 +44 +37 +37 +31 +62 +39 +95 +205 +25 +14 +18 +95 +26 +56 +10 +29 +59 +74 +24 +72 +19 +42 +18 +64 +33 +34 +54 +41 +51 +74 +41 +12 +9 +35 +25 +73 +39 +11 +76 +33 +9 +36 +52 +72 +27 +62 +45 +26 +149 +104 +64 +24 +64 +19 +26 +34 +21 +21 +22 +22 +97 +15 +61 +70 +27 +22 +85 +20 +107 +100 +104 +78 +17 +63 +40 +11 +141 +27 +30 +24 +78 +167 +32 +19 +89 +59 +46 +22 +6 +55 +50 +79 +5 +38 +12 +50 +97 +78 +29 +55 +72 +17 +95 +76 +59 +76 +61 +9 +45 +26 +32 +107 +19 \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/col_sums.txt.gz b/sourced/ml/core/tests/swivel/col_sums.txt.gz new file mode 100644 index 0000000..081051d Binary files /dev/null and b/sourced/ml/core/tests/swivel/col_sums.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/col_vocab.txt b/sourced/ml/core/tests/swivel/col_vocab.txt new file mode 100644 index 0000000..78d2478 --- /dev/null +++ b/sourced/ml/core/tests/swivel/col_vocab.txt @@ -0,0 +1,304 @@ +i.access +i.action +i.activ +i.adapt +i.add +i.android +i.antonioleiva +i.app +i.append +i.appgroup +i.args +i.argument +i.argv +i.array +i.arrays +i.aslist +i.bar +i.build +i.bundle +i.button +i.call +i.captur +i.chdir +i.check +i.clear +i.click +i.close +i.com +i.command +i.communic +i.compil +i.confdir +i.conffil +i.conffilenam +i.config +i.conn +i.connect +i.content +i.copytre +i.count +i.counter +i.create +i.credenti +i.crypt +i.cursor +i.data +i.date +i.datetim +i.dbfile +i.delay +i.destdir +i.destfil +i.destroy +i.dev +i.develop +i.dict +i.dir +i.directori +i.dirnam +i.dirpath +i.dirs +i.docs +i.dropbox +i.edit +i.empty +i.endswith +i.env +i.error +i.euroclear +i.except +i.execut +i.exists +i.exit +i.expandus +i.ext +i.extens +i.factori +i.false +i.fetchon +i.file +i.filenam +i.files +i.find +i.finish +i.float +i.fname +i.for +i.format +i.get +i.getcwd +i.getenv +i.gethostbynam +i.getsiz +i.github +i.gone +i.handler +i.header +i.hide +i.hkey +i.home +i.host +i.impl +i.inflat +i.info +i.input +i.instanc +i.int +i.intent +i.interactor +i.invis +i.isdir +i.isempti +i.isfile +i.isoformat +i.item +i.items +i.iter +i.java +i.join +i.key +i.layout +i.len +i.length +i.line +i.linux +i.list +i.listdir +i.listen +i.listfil +i.lite +i.ljust +i.local +i.localtim +i.log +i.logdir +i.logfil +i.logfilenam +i.login +i.logsdir +i.long +i.lower +i.machin +i.main +i.make +i.makedir +i.master +i.math +i.menu +i.messag +i.min +i.move +i.mvpexampl +i.myping +i.name +i.navig +i.new +i.newenv +i.newfil +i.newscript +i.nmap +i.nmscan +i.node +i.none +i.number +i.object +i.old +i.onclick +i.oncreat +i.onfinish +i.onitem +i.onlogin +i.onopt +i.onpassword 
+i.onresum +i.onsuccess +i.onusernam +i.open +i.option +i.optpars +i.ospath +i.output +i.outputdir +i.outputfil +i.overrid +i.parent +i.parse +i.parser +i.pass +i.passwd +i.password +i.path +i.platform +i.popen +i.port +i.ports +i.posit +i.post +i.present +i.print +i.processor +i.profil +i.program +i.progress +i.putty +i.query +i.randint +i.random +i.range +i.raw +i.rdp +i.read +i.readfil +i.readlin +i.recycl +i.releas +i.remove +i.rename +i.replac +i.res +i.result +i.resume +i.ret +i.return +i.root +i.row +i.rows +i.run +i.runnabl +i.salt +i.saved +i.scan +i.scanner +i.screen +i.script +i.select +i.send +i.server +i.serverfil +i.session +i.set +i.show +i.shutil +i.sid +i.simple +i.sisdir +i.site +i.size +i.sleep +i.socket +i.sourcedir +i.sourcefil +i.split +i.splitext +i.sqlite +i.start +i.startswith +i.stat +i.state +i.stats +i.str +i.strftime +i.string +i.strip +i.subnet +i.subprocess +i.success +i.sum +i.sys +i.system +i.table +i.tablelist +i.test +i.text +i.tgt +i.thread +i.time +i.toast +i.today +i.todaystr +i.tofile +i.tohome +i.tostr +i.true +i.txt +i.type +i.usage +i.user +i.usernam +i.util +i.utils +i.valid +i.value +i.version +i.view +i.visibl +i.walk +i.widget +i.window +i.winreg +i.with +i.word +i.work +i.write +i.zip \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/col_vocab.txt.gz b/sourced/ml/core/tests/swivel/col_vocab.txt.gz new file mode 100644 index 0000000..73b3620 Binary files /dev/null and b/sourced/ml/core/tests/swivel/col_vocab.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/row_sums.txt b/sourced/ml/core/tests/swivel/row_sums.txt new file mode 100644 index 0000000..44ea5c6 --- /dev/null +++ b/sourced/ml/core/tests/swivel/row_sums.txt @@ -0,0 +1,304 @@ +21 +58 +76 +59 +38 +92 +102 +102 +58 +26 +23 +44 +77 +68 +50 +49 +76 +25 +76 +49 +37 +44 +61 +75 +30 +90 +79 +102 +12 +3 +27 +74 +57 +62 +59 +58 +51 +100 +34 +75 +32 +113 +55 +19 +43 +65 +41 +49 +49 +39 +22 +22 +99 +37 +16 +29 +86 +49 +13 +14 +28 +44 +80 +50 +29 +20 +13 +154 +24 +76 +62 +70 +128 +27 +21 +39 +11 +45 +43 +124 +92 +80 +141 +88 +23 +24 +50 +52 +175 +43 +115 +12 +29 +16 +49 +48 +19 +95 +10 +63 +37 +102 +59 +21 +106 +76 +65 +50 +95 +59 +26 +29 +23 +21 +91 +78 +8 +78 +142 +50 +76 +112 +76 +61 +37 +133 +55 +93 +37 +19 +13 +16 +20 +32 +31 +31 +31 +64 +13 +58 +11 +21 +198 +77 +39 +50 +7 +59 +111 +12 +50 +102 +29 +141 +55 +58 +13 +39 +30 +22 +10 +27 +60 +40 +4 +58 +50 +76 +52 +74 +41 +59 +40 +74 +40 +40 +156 +73 +16 +32 +34 +31 +27 +93 +58 +31 +27 +19 +28 +64 +82 +44 +37 +37 +31 +62 +39 +95 +205 +25 +14 +18 +95 +26 +56 +10 +29 +59 +74 +24 +72 +19 +42 +18 +64 +33 +34 +54 +41 +51 +74 +41 +12 +9 +35 +25 +73 +39 +11 +76 +33 +9 +36 +52 +72 +27 +62 +45 +26 +149 +104 +64 +24 +64 +19 +26 +34 +21 +21 +22 +22 +97 +15 +61 +70 +27 +22 +85 +20 +107 +100 +104 +78 +17 +63 +40 +11 +141 +27 +30 +24 +78 +167 +32 +19 +89 +59 +46 +22 +6 +55 +50 +79 +5 +38 +12 +50 +97 +78 +29 +55 +72 +17 +95 +76 +59 +76 +61 +9 +45 +26 +32 +107 +19 \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/row_sums.txt.gz b/sourced/ml/core/tests/swivel/row_sums.txt.gz new file mode 100644 index 0000000..5680a5a Binary files /dev/null and b/sourced/ml/core/tests/swivel/row_sums.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/row_vocab.txt b/sourced/ml/core/tests/swivel/row_vocab.txt new file mode 100644 index 0000000..78d2478 --- /dev/null +++ b/sourced/ml/core/tests/swivel/row_vocab.txt @@ -0,0 +1,304 @@ +i.access +i.action +i.activ +i.adapt +i.add +i.android +i.antonioleiva +i.app +i.append 
+i.appgroup +i.args +i.argument +i.argv +i.array +i.arrays +i.aslist +i.bar +i.build +i.bundle +i.button +i.call +i.captur +i.chdir +i.check +i.clear +i.click +i.close +i.com +i.command +i.communic +i.compil +i.confdir +i.conffil +i.conffilenam +i.config +i.conn +i.connect +i.content +i.copytre +i.count +i.counter +i.create +i.credenti +i.crypt +i.cursor +i.data +i.date +i.datetim +i.dbfile +i.delay +i.destdir +i.destfil +i.destroy +i.dev +i.develop +i.dict +i.dir +i.directori +i.dirnam +i.dirpath +i.dirs +i.docs +i.dropbox +i.edit +i.empty +i.endswith +i.env +i.error +i.euroclear +i.except +i.execut +i.exists +i.exit +i.expandus +i.ext +i.extens +i.factori +i.false +i.fetchon +i.file +i.filenam +i.files +i.find +i.finish +i.float +i.fname +i.for +i.format +i.get +i.getcwd +i.getenv +i.gethostbynam +i.getsiz +i.github +i.gone +i.handler +i.header +i.hide +i.hkey +i.home +i.host +i.impl +i.inflat +i.info +i.input +i.instanc +i.int +i.intent +i.interactor +i.invis +i.isdir +i.isempti +i.isfile +i.isoformat +i.item +i.items +i.iter +i.java +i.join +i.key +i.layout +i.len +i.length +i.line +i.linux +i.list +i.listdir +i.listen +i.listfil +i.lite +i.ljust +i.local +i.localtim +i.log +i.logdir +i.logfil +i.logfilenam +i.login +i.logsdir +i.long +i.lower +i.machin +i.main +i.make +i.makedir +i.master +i.math +i.menu +i.messag +i.min +i.move +i.mvpexampl +i.myping +i.name +i.navig +i.new +i.newenv +i.newfil +i.newscript +i.nmap +i.nmscan +i.node +i.none +i.number +i.object +i.old +i.onclick +i.oncreat +i.onfinish +i.onitem +i.onlogin +i.onopt +i.onpassword +i.onresum +i.onsuccess +i.onusernam +i.open +i.option +i.optpars +i.ospath +i.output +i.outputdir +i.outputfil +i.overrid +i.parent +i.parse +i.parser +i.pass +i.passwd +i.password +i.path +i.platform +i.popen +i.port +i.ports +i.posit +i.post +i.present +i.print +i.processor +i.profil +i.program +i.progress +i.putty +i.query +i.randint +i.random +i.range +i.raw +i.rdp +i.read +i.readfil +i.readlin +i.recycl +i.releas +i.remove +i.rename +i.replac +i.res +i.result +i.resume +i.ret +i.return +i.root +i.row +i.rows +i.run +i.runnabl +i.salt +i.saved +i.scan +i.scanner +i.screen +i.script +i.select +i.send +i.server +i.serverfil +i.session +i.set +i.show +i.shutil +i.sid +i.simple +i.sisdir +i.site +i.size +i.sleep +i.socket +i.sourcedir +i.sourcefil +i.split +i.splitext +i.sqlite +i.start +i.startswith +i.stat +i.state +i.stats +i.str +i.strftime +i.string +i.strip +i.subnet +i.subprocess +i.success +i.sum +i.sys +i.system +i.table +i.tablelist +i.test +i.text +i.tgt +i.thread +i.time +i.toast +i.today +i.todaystr +i.tofile +i.tohome +i.tostr +i.true +i.txt +i.type +i.usage +i.user +i.usernam +i.util +i.utils +i.valid +i.value +i.version +i.view +i.visibl +i.walk +i.widget +i.window +i.winreg +i.with +i.word +i.work +i.write +i.zip \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/row_vocab.txt.gz b/sourced/ml/core/tests/swivel/row_vocab.txt.gz new file mode 100644 index 0000000..5dd2fff Binary files /dev/null and b/sourced/ml/core/tests/swivel/row_vocab.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/shard-000-000.pb.gz b/sourced/ml/core/tests/swivel/shard-000-000.pb.gz new file mode 100644 index 0000000..7f2bf35 Binary files /dev/null and b/sourced/ml/core/tests/swivel/shard-000-000.pb.gz differ diff --git a/sourced/ml/core/tests/test_bblfsh_utils.py b/sourced/ml/core/tests/test_bblfsh_utils.py new file mode 100644 index 0000000..1e6f6c5 --- /dev/null +++ b/sourced/ml/core/tests/test_bblfsh_utils.py @@ -0,0 +1,85 @@ 
+import errno +import os +import random +import socket +import time +import unittest + +import docker.client + +from sourced.ml.core.utils.bblfsh import BBLFSH_VERSION_HIGH, BBLFSH_VERSION_LOW, check_version + + +@unittest.skipIf(os.getenv("SKIP_BBLFSH_UTILS_TESTS", False), "Skip ml_core.utils.bblfsh tests.") +class BblfshUtilsTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.docker_client = docker.from_env() + # ensure docker is running + try: + cls.docker_client.containers.list() + except Exception: + raise Exception("docker not running properly") + cls.er_msg = "supported bblfshd versions: " \ + ">=%s,<%s" % (BBLFSH_VERSION_LOW, BBLFSH_VERSION_HIGH) + + def __check_bblfsh_version_support(self, version: str) -> bool: + """ + :param version: version of bblfshd to check + :return: True if version is supported, False otherwise + """ + with socket.socket() as s: + for _ in range(3): + try: + port = random.randint(10000, 50000) + s.connect(("localhost", port)) + except socket.error as e: + if e.errno == errno.ECONNREFUSED: + break + + container = self.docker_client.containers.run( + image="bblfsh/bblfshd:%s" % version, + privileged=True, + detach=True, + ports={"9432": port}, + ) + + assert container is not None, "failed to create bblfsh container" + + for _ in range(10): + try: + res = check_version(port=port) + break + except Exception: + time.sleep(.1) + pass + + container.stop() + container.remove() + return res + + def test_v200(self): + self.assertFalse(self.__check_bblfsh_version_support("v2.0.0"), self.er_msg) + + def test_v210(self): + self.assertFalse(self.__check_bblfsh_version_support("v2.1.0"), self.er_msg) + + def test_v220(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.2.0"), self.er_msg) + + def test_v230(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.3.0"), self.er_msg) + + def test_v240(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.4.0"), self.er_msg) + + def test_v250(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.5.0"), self.er_msg) + + @classmethod + def tearDownClass(cls): + cls.docker_client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_bigartm.py b/sourced/ml/core/tests/test_bigartm.py new file mode 100644 index 0000000..1228c13 --- /dev/null +++ b/sourced/ml/core/tests/test_bigartm.py @@ -0,0 +1,29 @@ +import argparse +import os +import subprocess +import tempfile +import unittest + +from sourced.ml.core.utils import install_bigartm + + +class BigartmTests(unittest.TestCase): + gitdir = os.path.join(os.path.dirname(__file__), "..", "..") + + @unittest.skipUnless(os.getenv("FULL_TEST", False), "Need to define FULL_TEST env var.") + def test_install_bigartm(self): + with tempfile.TemporaryDirectory() as tmpdir: + args = argparse.Namespace(output=tmpdir, tmpdir=None) + self.assertIsNone(install_bigartm(args)) + self._valivate_bigartm(tmpdir) + + def _valivate_bigartm(self, tmpdir): + bigartm = os.path.join(tmpdir, "bigartm") + self.assertTrue(os.path.isfile(bigartm)) + self.assertEqual(os.stat(bigartm).st_mode & 0o777, 0o777) + output = subprocess.check_output([bigartm], stderr=subprocess.STDOUT) + self.assertIn("BigARTM v", output.decode()) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_bow.py b/sourced/ml/core/tests/test_bow.py new file mode 100644 index 0000000..4d054d0 --- /dev/null +++ b/sourced/ml/core/tests/test_bow.py @@ -0,0 +1,44 @@ +from io import BytesIO +import unittest + 
+import numpy + +from sourced.ml.core.models import BOW +import sourced.ml.core.tests.models as paths + + +class BOWTests(unittest.TestCase): + def setUp(self): + self.model = BOW().load(source=paths.BOW) + + def test_getitem(self): + repo_name, indices, weights = self.model[0] + self.assertEqual(repo_name, "repo1") + self.assertIsInstance(indices, numpy.ndarray) + self.assertIsInstance(weights, numpy.ndarray) + self.assertEqual(indices.shape, weights.shape) + self.assertEqual(indices.shape, (3,)) + + def test_iter(self): + pumped = list(self.model) + self.assertEqual(len(pumped), 5) + self.assertEqual(pumped, list(range(5))) + + def test_len(self): + self.assertEqual(len(self.model), 5) + + def test_tokens(self): + self.assertEqual(self.model.tokens[0], "i.") + + def test_write(self): + buffer = BytesIO() + self.model.save(output=buffer, series="bow-docfreq") + buffer.seek(0) + new_model = BOW().load(buffer) + self.assertEqual((self.model.matrix != new_model.matrix).nnz, 0) + self.assertEqual(self.model.documents, new_model.documents) + self.assertEqual(self.model.tokens, new_model.tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_coocc.py b/sourced/ml/core/tests/test_coocc.py new file mode 100644 index 0000000..d21a21b --- /dev/null +++ b/sourced/ml/core/tests/test_coocc.py @@ -0,0 +1,28 @@ +import unittest + +from sourced.ml.core.models import Cooccurrences +import sourced.ml.core.tests.models as paths + + +class CooccurrencesTests(unittest.TestCase): + def setUp(self): + self.model = Cooccurrences().load(source=paths.COOCC) + + def test_tokens(self): + tokens = self.model.tokens + self.assertIsInstance(tokens, list) + self.assertEqual(tokens[:10], ["i.set", "i.iter", "i.error", "i.logsdir", "i.read", + "i.captur", "i.clear", "i.android", "i.tohome", "i.ljust"]) + self.assertEqual(len(tokens), 304) + + def test_matrix(self): + matrix = self.model.matrix + self.assertEqual(matrix.shape, (304, 304)) + self.assertEqual(matrix.getnnz(), 16001) + + def test_len(self): + self.assertEqual(len(self.model), 304) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_df.py b/sourced/ml/core/tests/test_df.py new file mode 100644 index 0000000..b7b2872 --- /dev/null +++ b/sourced/ml/core/tests/test_df.py @@ -0,0 +1,101 @@ +from io import BytesIO +import unittest + +from sourced.ml.core.models import DocumentFrequencies +import sourced.ml.core.tests.models as paths + + +class DocumentFrequenciesTests(unittest.TestCase): + def setUp(self): + self.model = DocumentFrequencies().load(source=paths.DOCFREQ) + + def test_docs(self): + docs = self.model.docs + self.assertIsInstance(docs, int) + self.assertEqual(docs, 1000) + + def test_get(self): + self.assertEqual(self.model["aaaaaaa"], 341) + with self.assertRaises(KeyError): + print(self.model["xaaaaaa"]) + self.assertEqual(self.model.get("aaaaaaa", 0), 341) + self.assertEqual(self.model.get("xaaaaaa", 100500), 100500) + + def test_tokens(self): + self.assertEqual(list(self.model._df), self.model.tokens()) + + def test_len(self): + # the remaining 18 are not unique - the model was generated badly + self.assertEqual(len(self.model), 982) + + def test_iter(self): + aaa = False + for tok, freq in self.model: + if "aaaaaaa" in tok: + aaa = True + int(freq) + break + self.assertTrue(aaa) + + def test_prune(self): + pruned = self.model.prune(4) + for _, freq in pruned: + self.assertGreaterEqual(freq, 4) + self.assertEqual(len(pruned), 346) + + def test_prune_self(self): + pruned 
= self.model.prune(1) + self.assertIs(self.model, pruned) + + def test_greatest(self): + pruned = self.model.greatest(100) + freqs = [v for v in self.model._df.values()] + freqs.sort(reverse=True) + border = freqs[100] + for v in pruned._df.values(): + self.assertGreaterEqual(v, border) + df1 = pruned._df + df2 = self.model.greatest(100)._df + self.assertEqual(df1, df2) + + def test_greatest2(self): + df = DocumentFrequencies().construct(100, {str(x): x for x in range(1000)}) + df_greatest_true = {str(x): x for x in range(500, 1000)} + df_greatest = df.greatest(500) + self.assertEqual(df_greatest._df, df_greatest_true) + + df._df["500a"] = 500 + df._df["500b"] = 500 + df._df["500c"] = 500 + df._df["500d"] = 500 + df._df["500e"] = 500 + + df_greatest = df.greatest(500) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["500a"] = 500 + df_greatest = df.greatest(501) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["500b"] = 500 + df_greatest_true["500c"] = 500 + df_greatest_true["500d"] = 500 + df_greatest_true["500e"] = 500 + df_greatest = df.greatest(505) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["499"] = 499 + df_greatest = df.greatest(506) + self.assertEqual(df_greatest._df, df_greatest_true) + + def test_write(self): + buffer = BytesIO() + self.model.save(buffer) + buffer.seek(0) + new_model = DocumentFrequencies().load(buffer) + self.assertEqual(self.model._df, new_model._df) + self.assertEqual(self.model.docs, new_model.docs) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_dump.py b/sourced/ml/core/tests/test_dump.py new file mode 100644 index 0000000..74d2d19 --- /dev/null +++ b/sourced/ml/core/tests/test_dump.py @@ -0,0 +1,131 @@ +import argparse +from contextlib import contextmanager +from io import StringIO +import logging +import os +import shutil +import sys +import tempfile +import unittest + +from modelforge.tools import dump_model + +import sourced.ml.core.tests.models as paths + + +cache_dir = os.path.join(tempfile.gettempdir(), "ml-test-dump") + + +@contextmanager +def captured_output(): + log = StringIO() + log_handler = logging.StreamHandler(log) + logging.getLogger().addHandler(log_handler) + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr, log + finally: + sys.stdout, sys.stderr = old_out, old_err + logging.getLogger().removeHandler(log_handler) + + +class DumpTests(unittest.TestCase): + ID2VEC_DUMP = """{'created_at': '2017-06-18 17:37:06', \ +'dependencies': [], \ +'license': 'ODbL-1.0', \ +'model': 'id2vec', \ +'series': 'id2vec-1000', \ +'size': '1.1 MB', \ +'uuid': '92609e70-f79c-46b5-8419-55726e873cfc', \ +'vendor': 'source{d}', \ +'version': [1, 0, 0]} +Shape: (1000, 300) +First 10 words: ['get', 'name', 'type', 'string', 'class', 'set', 'data', 'value', 'self', 'test'] +""" + + DOCFREQ_DUMP = """{'created_at': '2017-08-09 16:49:12', \ +'dependencies': [], \ +'license': 'ODbL-1.0', \ +'model': 'docfreq', \ +'series': 'docfreq-1000', \ +'size': '6.4 kB', \ +'uuid': 'f64bacd4-67fb-4c64-8382-399a8e7db52a', \ +'vendor': 'source{d}', \ +'version': [0, 1, 0]} +Number of words: 982 +""" + "Random 10 words: " + + BOW_DUMP = """{'created_at': '2018-01-18 21:59:59', \ +'dependencies': [{'created_at': datetime.datetime(2018, 1, 18, 21, 59, 48, 828287), \ +'dependencies': [], \ +'model': 'docfreq', \ +'uuid': 
'2c4fcae7-93a6-496e-9e3a-d6e15d35b812', \ +'version': [1, 0, 0]}], \ +'license': 'ODbL-1.0', \ +'model': 'bow', \ +'parent': '51b4165d-b2c6-442a-93be-0eb35f4cc19a', \ +'series': 'bow-docfreq', \ +'size': '2.5 kB', \ +'uuid': '0d95f342-2c69-459f-9ee7-a1fc7da88d64', \ +'vendor': 'source{d}', \ +'version': [1, 0, 15]} +Shape: (5, 20) +First 10 documents: ['repo1', 'repo2', 'repo3', 'repo4', 'repo5'] +First 10 tokens: ['i.', 'i.*', 'i.Activity', 'i.AdapterView', 'i.ArrayAdapter', 'i.Arrays', 'i.Bundle', 'i.EditText', 'i.Exception', 'i.False']\n""" # noqa + + COOCC_DUMP = """{'created_at': '2018-01-24 16:00:02', \ +'dependencies': [{'created_at': datetime.datetime(2018, 1, 24, 15, 59, 24, 129470), \ +'dependencies': [], \ +'model': 'docfreq', \ +'uuid': '0f94a6c6-7dc3-4b3c-b8d2-917164a50581', \ +'version': [1, 0, 0]}], \ +'license': 'ODbL-1.0', \ +'model': 'co-occurrences', \ +'series': 'coocc-docfreq', \ +'size': '79.9 kB', \ +'uuid': 'e75dcb2d-ec1d-476b-a04b-bc64c7779ae1', \ +'vendor': 'source{d}', \ +'version': [1, 0, 0]} +Number of words: 304 +First 10 words: ['i.set', 'i.iter', 'i.error', 'i.logsdir', 'i.read', 'i.captur', 'i.clear',""" + \ + """ 'i.android', 'i.tohome', 'i.ljust'] +Matrix: shape: (304, 304) non-zero: 16001 +""" + + def tearDown(self): + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir) + + def test_id2vec(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.ID2VEC)) + self.assertEqual(out.getvalue(), self.ID2VEC_DUMP) + + def test_docfreq(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.DOCFREQ)) + self.assertEqual(out.getvalue()[:len(self.DOCFREQ_DUMP)], self.DOCFREQ_DUMP) + ending = "\nNumber of documents: 1000\n" + self.assertEqual(out.getvalue()[-len(ending):], ending) + + def test_bow(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.BOW)) + self.assertEqual(out.getvalue(), self.BOW_DUMP) + + def test_coocc(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.COOCC)) + self.assertEqual(out.getvalue(), self.COOCC_DUMP) + + @staticmethod + def _get_args(input): + return argparse.Namespace(input=input, backend=None, args=None, username="", + password="", index_repo="https://github.com/src-d/models", + cache=cache_dir, signoff=False, log_level="WARNING") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_id_splitter_features.py b/sourced/ml/core/tests/test_id_splitter_features.py new file mode 100644 index 0000000..6334d24 --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_features.py @@ -0,0 +1,145 @@ +import io +import tarfile +import tempfile +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow +from sourced.ml.core.tests.models import IDENTIFIERS + + +def write_fake_identifiers(tar_file, n_lines, char_sizes, n_cols, text="a"): + """ + Prepare file with fake identifiers. + :param tar_file: ready to write file. + :param n_lines: number of lines to generate. + :param char_sizes: sizes of identifiers. + :param n_cols: number of columns. + :param text: text that is used to fill identifiers. 
+ """ + # sanity check + if isinstance(char_sizes, int): + char_sizes = [char_sizes] * n_lines + assert len(char_sizes) == n_lines + + # generate file + res = [] + for sz in char_sizes: + line = ",".join([text * sz] * n_cols) + res.append(line) + content = "\n".join(res) + content = content.encode("utf-8") + + # add content to file + info = tarfile.TarInfo("identifiers.txt") + info.size = len(content) + tar_file.addfile(info, io.BytesIO(content)) + + +class IdSplitterTest(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_features(self): + from sourced.ml.core.algorithms.id_splitter.features import prepare_features + # check feature extraction + text = "a a" + n_lines = 10 + max_identifier_len = 20 + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=n_lines, char_sizes=1, n_cols=2, text=text) + feat = prepare_features(csv_path=tmp.name, use_header=True, identifier_col=0, + max_identifier_len=max_identifier_len, split_identifier_col=1, + shuffle=True, test_ratio=0.5, padding="post") + x_train, x_test, y_train, y_test = feat + # because of test_ratio=0.5 - shapes should be equal + self.assertEqual(x_test.shape, x_train.shape) + self.assertEqual(y_test.shape, y_train.shape) + # each line contains only one split -> so it should be only 5 nonzero for train/test + self.assertEqual(numpy.sum(y_test), 5) + self.assertEqual(numpy.sum(y_train), 5) + # each line contains only two chars -> so it should be only 10 nonzero for train/test + self.assertEqual(numpy.count_nonzero(x_test), 10) + self.assertEqual(numpy.count_nonzero(x_train), 10) + # y should be 3 dimensional matrix + self.assertEqual(y_test.ndim, 3) + self.assertEqual(y_train.ndim, 3) + # x should be 2 dimensional matrix + self.assertEqual(x_test.ndim, 2) + self.assertEqual(x_train.ndim, 2) + # check number of samples + self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines) + self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines) + # check max_identifier_len + self.assertEqual(x_test.shape[1], max_identifier_len) + self.assertEqual(x_train.shape[1], max_identifier_len) + self.assertEqual(y_test.shape[1], max_identifier_len) + self.assertEqual(y_train.shape[1], max_identifier_len) + + # normal file + try: + prepare_features(csv_path=IDENTIFIERS, use_header=True, identifier_col=0, + max_identifier_len=max_identifier_len, split_identifier_col=1, + shuffle=True, test_ratio=0.5, padding="post") + except Exception as e: + self.fail("prepare_features raised %s with log %s" % (type(e), str(e))) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_read_identifiers(self): + from sourced.ml.core.algorithms.id_splitter.features import read_identifiers + # read with header + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10, + identifier_col=3, split_identifier_col=4) + self.assertEqual(len(res), 10) + + # read without header + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers(csv_path=tmp.name, use_header=False, max_identifier_len=10, + identifier_col=3, 
split_identifier_col=4) + self.assertEqual(len(res), 9) + + # read with max_identifier_len equal to 0 -> expect empty list + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=0, + identifier_col=3, split_identifier_col=4) + self.assertEqual(len(res), 0) + + # generate temporary file with identifiers of specific lengths and filter by length + char_sizes = list(range(1, 11)) + + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=5) + + # check filtering + # read last two columns as identifiers + for i in range(11): + res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=i, + identifier_col=3, split_identifier_col=4) + self.assertEqual(len(res), i) + + # read wrong columns + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=2) + + with self.assertRaises(IndexError): + read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10, + identifier_col=3, split_identifier_col=4) + + # normal file + try: + read_identifiers(csv_path=IDENTIFIERS, use_header=True, max_identifier_len=10, + identifier_col=3, split_identifier_col=4) + except Exception as e: + self.fail("read_identifiers raised %s with log %s" % (type(e), str(e))) diff --git a/sourced/ml/core/tests/test_id_splitter_nn_model.py b/sourced/ml/core/tests/test_id_splitter_nn_model.py new file mode 100644 index 0000000..d98cfba --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_nn_model.py @@ -0,0 +1,56 @@ +import string +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow + + +class MetricsTests(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_register_metric(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS + fake_metric = "fake metric" + register_metric(fake_metric) + self.assertIn(fake_metric, METRICS) + METRICS.pop() + self.assertNotIn(fake_metric, METRICS) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_raise_register_metric(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS + bad_metric = 1 + with self.assertRaises(AssertionError): + register_metric(bad_metric) + self.assertNotIn(bad_metric, METRICS) + + +class ModelsTests(unittest.TestCase): + def setUp(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import build_rnn, build_cnn + self.n_uniq = len(string.ascii_lowercase) + self.model_rnn = build_rnn(maxlen=5, units=24, stack=2, rnn_layer="LSTM", + optimizer="Adam", dev0="/cpu:0", dev1="/cpu:0") + self.model_cnn = build_cnn(maxlen=5, filters=[64, 32, 16, 8], output_n_filters=32, + stack=2, kernel_sizes=[2, 4, 8, 16], optimizer="Adam", + device="/cpu:0") + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_rnn(self): + self.assertTrue(self.model_rnn.built) + self.assertTrue(self.model_rnn.trainable) + self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray) + self.assertEqual(self.model_rnn.get_weights()[0].shape, 
(self.n_uniq+1, self.n_uniq+1)) + self.assertTrue(self.model_rnn.uses_learning_phase) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_cnn(self): + self.assertTrue(self.model_cnn.built) + self.assertTrue(self.model_cnn.trainable) + self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray) + self.assertEqual(self.model_cnn.get_weights()[0].shape, (self.n_uniq+1, self.n_uniq+1)) + self.assertTrue(self.model_cnn.uses_learning_phase) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_id_splitter_pipeline.py b/sourced/ml/core/tests/test_id_splitter_pipeline.py new file mode 100644 index 0000000..ac8d384 --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_pipeline.py @@ -0,0 +1,126 @@ +import tempfile +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow + + +class IdSplitterPipelineTest(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_binarize(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import binarize + thresholds = [0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99] + n_pos = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + + for th, n_p in zip(thresholds, n_pos): + vals = numpy.arange(10) / 10 + res = binarize(vals, th) + self.assertEqual(sum(binarize(vals, th)), n_p) + if th in (0, 0.99): + self.assertEqual(numpy.unique(res).shape[0], 1) + else: + self.assertEqual(numpy.unique(res).shape[0], 2) + + vals = numpy.arange(10) / 10 + old_vals = vals.copy() + for th, n_p in zip(thresholds, n_pos): + res = binarize(vals, th, inplace=False) + self.assertEqual(sum(res), n_p) + self.assertTrue(numpy.array_equal(old_vals, vals)) + if th in (0, 0.99): + self.assertEqual(numpy.unique(res).shape[0], 1) + else: + self.assertEqual(numpy.unique(res).shape[0], 2) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_devices(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import prepare_devices + correct_args = ["1", "0,1", "-1"] + resulted_dev = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"), ("/cpu:0", "/cpu:0")] + for res, arg in zip(resulted_dev, correct_args): + self.assertEqual(res, prepare_devices(arg)) + + bad_args = ["", "1,2,3"] + for arg in bad_args: + with self.assertRaises(ValueError): + prepare_devices(arg) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_schedule(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import build_schedule + start_lr = 10 + end_lr = 1 + n_epochs = 9 + + lr_schedule = build_schedule(lr=start_lr, final_lr=end_lr, n_epochs=n_epochs) + + for i in range(n_epochs): + self.assertEqual(start_lr - i, lr_schedule(epoch=i)) + + with self.assertRaises(AssertionError): + lr_schedule(-1) + with self.assertRaises(AssertionError): + lr_schedule(n_epochs + 1) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_train_generator(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import build_train_generator + batch_size = 3 + # mismatch number of samples + bad_x = numpy.zeros(3) + bad_y = numpy.zeros(4) + with self.assertRaises(AssertionError): + build_train_generator(bad_x, bad_y, batch_size=batch_size) + + # check generator with correct inputs + x = numpy.zeros(5) + gen = build_train_generator(x, x, batch_size=batch_size) + expected_n_samples = [3, 2] + for n_samples in expected_n_samples: + x_gen, y_gen = next(gen) + 
self.assertEqual(x_gen.shape, y_gen.shape) + self.assertEqual(n_samples, x_gen.shape[0]) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_train_parameters(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import create_generator_params + batch_size = 500 + samples_per_epoch = 10 ** 6 + n_samples = 40 * 10 ** 6 + epochs = 10 + + steps_per_epoch_ = samples_per_epoch // batch_size + n_epochs_ = numpy.ceil(epochs * n_samples / samples_per_epoch) + + steps_per_epoch, n_epochs = create_generator_params(batch_size, samples_per_epoch, + n_samples, epochs) + self.assertEqual(steps_per_epoch, steps_per_epoch_) + self.assertEqual(n_epochs, n_epochs_) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_config_keras(self): + from keras.backend.tensorflow_backend import get_session + from sourced.ml.core.algorithms.id_splitter.pipeline import config_keras + config_keras() + sess = get_session() + self.assertTrue(sess._config.gpu_options.allow_growth) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_callbacks(self): + from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint + from sourced.ml.core.algorithms.id_splitter.pipeline import prepare_callbacks + with tempfile.TemporaryDirectory() as tmpdir: + callbacks = prepare_callbacks(tmpdir) + + # TensorBoard + self.assertIsInstance(callbacks[0], TensorBoard) + self.assertTrue(callbacks[0].log_dir.startswith(tmpdir)) + + # CSVLogger + self.assertIsInstance(callbacks[1], CSVLogger) + self.assertTrue(callbacks[1].filename.startswith(tmpdir)) + + # ModelCheckpoint + self.assertIsInstance(callbacks[2], ModelCheckpoint) + self.assertTrue(callbacks[2].filepath.startswith(tmpdir)) diff --git a/sourced/ml/core/tests/test_inttypes_to_nodes.py b/sourced/ml/core/tests/test_inttypes_to_nodes.py new file mode 100644 index 0000000..998a7fd --- /dev/null +++ b/sourced/ml/core/tests/test_inttypes_to_nodes.py @@ -0,0 +1,40 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2QuantizedChildren +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2NodesBagTest(unittest.TestCase): + def setUp(self): + self.nodes_bag_extractor = Uast2QuantizedChildren(npartitions=3) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.nodes_bag_extractor(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + def test_quantize_1(self): + freqs = {1: 100, 2: 90, 3: 10, 5: 10, 6: 5, 7: 5} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 3, 7]) + + def test_quantize_2(self): + freqs = {1: 10, 2: 10, 3: 10, 5: 10, 6: 10, 7: 10} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 3, 6, 7]) + + def test_quantize_3(self): + freqs = {1: 100, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 7, 7]) + + def test_quantize_4(self): + freqs = {1: 10, 2: 15, 3: 5, 5: 15, 6: 10, 7: 10} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 5, 7]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_merge_bow.py b/sourced/ml/core/tests/test_merge_bow.py new file mode 100644 index 0000000..d59d73c --- /dev/null +++ 
b/sourced/ml/core/tests/test_merge_bow.py @@ -0,0 +1,87 @@ +import os +import tempfile +import unittest + +import numpy +from scipy.sparse import csc_matrix + +from sourced.ml.core.models import BOW +from sourced.ml.core.models.model_converters.merge_bow import MergeBOW + + +class MergeBOWTests(unittest.TestCase): + def setUp(self): + self.model1 = BOW() \ + .construct(["doc_1", "doc_2", "doc_3"], ["f.tok_1", "k.tok_2", "f.tok_3"], + csc_matrix((numpy.array([1, 2]), + (numpy.array([0, 1]), numpy.array([1, 0]))), + shape=(3, 3))) + self.model1._meta = {"dependencies": [{"model": "docfreq", "uuid": "uuid"}]} + self.model2 = BOW() \ + .construct(["doc_4", "doc_5", "doc_6"], ["f.tok_1", "k.tok_2", "f.tok_3"], + csc_matrix((numpy.array([3, 4]), + (numpy.array([0, 1]), numpy.array([1, 0]))), + shape=(3, 3))) + self.model2._meta = {"dependencies": [{"model": "docfreq", "uuid": "uuid"}]} + self.merge_results = [[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 3, 0], [4, 0, 0], [0, 0, 0]] + self.merge_bow = MergeBOW() + + def test_convert_model_base(self): + self.merge_bow.convert_model(self.model1) + self.assertListEqual(self.merge_bow.documents, ["doc_1", "doc_2", "doc_3"]) + self.assertListEqual(self.merge_bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, row in enumerate(self.merge_bow.matrix[0].toarray()): + self.assertListEqual(list(row), self.merge_results[i]) + self.assertEqual(self.merge_bow.deps, [{"uuid": "uuid", "model": "docfreq"}]) + self.merge_bow.convert_model(self.model2) + self.assertListEqual(self.merge_bow.documents, + ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5", "doc_6"]) + self.assertListEqual(self.merge_bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, arr in enumerate(self.merge_bow.matrix): + for j, row in enumerate(arr.toarray()): + self.assertListEqual(list(row), self.merge_results[i * 3 + j]) + self.assertEqual(self.merge_bow.deps, [{"model": "docfreq", "uuid": "uuid"}]) + + def test_convert_model_error(self): + self.merge_bow.convert_model(self.model1) + self.model2._tokens = ["f.tok_1", "k.tok_2"] + with self.assertRaises(ValueError): + self.merge_bow.convert_model(self.model2) + self.model2._tokens = ["f.tok_1", "k.tok_2", "f.tok_3", "f.tok_4"] + with self.assertRaises(ValueError): + self.merge_bow.convert_model(self.model2) + + def test_finalize_base(self): + self.merge_bow.convert_model(self.model1) + self.merge_bow.convert_model(self.model2) + with tempfile.TemporaryDirectory(prefix="merge-bow-") as tmpdir: + dest = os.path.join(tmpdir, "bow.asdf") + self.merge_bow.finalize(0, dest) + bow = BOW().load(dest) + self.assertListEqual(bow.documents, + ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5", "doc_6"]) + self.assertListEqual(bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, row in enumerate(bow.matrix.toarray()): + self.assertListEqual(list(row), self.merge_results[i]) + self.assertEqual(bow.meta["dependencies"], [{"uuid": "uuid", "model": "docfreq"}]) + + def test_finalize_reduce(self): + self.merge_bow.convert_model(self.model1) + self.merge_bow.features_namespaces = "f." 
+ with tempfile.TemporaryDirectory(prefix="merge-bow-") as tmpdir: + dest = os.path.join(tmpdir, "bow.asdf") + self.merge_bow.finalize(0, dest) + bow = BOW().load(dest) + self.assertListEqual(bow.documents, ["doc_1", "doc_2", "doc_3"]) + self.assertListEqual(bow.tokens, ["f.tok_1", "f.tok_3"]) + for i, row in enumerate(bow.matrix.toarray()): + self.assertListEqual(list(row), self.merge_results[i][::2]) + self.assertEqual(bow.meta["dependencies"], [{"uuid": "uuid", "model": "docfreq"}]) + + def test_save_path(self): + self.assertEqual(self.merge_bow._save_path(0, "bow.asdf"), "bow.asdf") + self.assertEqual(self.merge_bow._save_path(0, "bow"), os.path.join("bow", "bow_0.asdf")) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_merge_df.py b/sourced/ml/core/tests/test_merge_df.py new file mode 100644 index 0000000..c2c536a --- /dev/null +++ b/sourced/ml/core/tests/test_merge_df.py @@ -0,0 +1,40 @@ +import os +import tempfile +import unittest + +from sourced.ml.core.models import DocumentFrequencies +from sourced.ml.core.models.model_converters.merge_df import MergeDocFreq + + +class Model2BaseTests(unittest.TestCase): + def setUp(self): + self.model1 = DocumentFrequencies().construct(3, {"one": 1, "two": 2, "three": 3}) + self.model2 = DocumentFrequencies().construct(3, {"four": 4, "three": 3, "five": 5}) + self.merge_df = MergeDocFreq(min_docfreq=1, vocabulary_size=100) + self.merge_result = {"one": 1, "two": 2, "three": 6, "four": 4, "five": 5} + + def test_convert_model(self): + self.merge_df.convert_model(self.model1) + self.assertEqual(self.merge_df._docs, 3) + self.assertEqual(self.merge_df._df, self.model1._df) + self.merge_df.convert_model(self.model2) + self.assertEqual(self.merge_df._docs, 6) + self.assertEqual(self.merge_df._df, self.merge_result) + + def test_finalize(self): + self.merge_df.convert_model(self.model1) + self.merge_df.convert_model(self.model2) + with tempfile.TemporaryDirectory(prefix="merge-df-") as tmpdir: + dest = os.path.join(tmpdir, "df.asdf") + self.merge_df.finalize(0, dest) + df = DocumentFrequencies().load(dest) + self.assertEqual(df.docs, 6) + self.assertEqual(df._df, self.merge_result) + + def test_save_path(self): + self.assertEqual(self.merge_df._save_path(0, "df.asdf"), "df.asdf") + self.assertEqual(self.merge_df._save_path(0, "df"), os.path.join("df", "docfreq_0.asdf")) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_model2base.py b/sourced/ml/core/tests/test_model2base.py new file mode 100644 index 0000000..97c6625 --- /dev/null +++ b/sourced/ml/core/tests/test_model2base.py @@ -0,0 +1,99 @@ +import os +import tempfile +import unittest + +from sourced.ml.core.models.model_converters.base import Model2Base + + +class FromModel: + NAME = "from" + meta = {"dependencies": ()} + + def __init__(self, **kwargs): + pass + + def load(self, source): + pass + + +class ToModel: + NAME = "to" + output = None + meta = {"dependencies": ()} + + def __init__(self, **kwargs): + pass + + def save(self, output, deps=None): + ToModel.output = output + + +class Model2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + finalized = False + + def convert_model(self, model): + return ToModel() + + +class MockingModel2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + finalized = False + + def convert_model(self, model): + return ToModel() + + def finalize(self, index: int, destdir: str): + self.finalized = True + + +class 
RaisingModel2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + + def convert_model(self, model): + raise ValueError("happens") + + +class FakeQueue: + def __init__(self, contents: list): + self.contents = contents + + def get(self): + return self.contents.pop() + + def put(self, item): + self.contents.append(item) + + +class Model2BaseTests(unittest.TestCase): + def test_convert(self): + converter = Model2Test(num_processes=2) + with tempfile.TemporaryDirectory() as tmpdir: + status = converter.convert(os.listdir(os.path.dirname(__file__)), tmpdir) + self.assertGreater(status, 20) + + def test_process_entry(self): + converter = MockingModel2Test(num_processes=2) + queue_in = FakeQueue([None, "srcdir/job"]) + queue_out = FakeQueue([]) + with tempfile.TemporaryDirectory(prefix="sourced-ml-") as tmpdir: + converter._process_entry( + 0, os.path.join(tmpdir, "destdir"), queue_in, queue_out) + self.assertTrue(os.path.exists(os.path.join(tmpdir, "destdir"))) + self.assertEqual(ToModel.output, os.path.join(tmpdir, "destdir", "job")) + self.assertTrue(converter.finalized) + self.assertEqual(queue_out.contents, [("srcdir/job", True)]) + + def test_process_entry_exception(self): + converter = RaisingModel2Test(num_processes=2) + queue_in = FakeQueue([None, "srcdir/job"]) + queue_out = FakeQueue([]) + converter._process_entry(0, "destdir", queue_in, queue_out) + self.assertEqual(queue_out.contents, [("srcdir/job", False)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_pickleable_logger.py b/sourced/ml/core/tests/test_pickleable_logger.py new file mode 100644 index 0000000..e52b436 --- /dev/null +++ b/sourced/ml/core/tests/test_pickleable_logger.py @@ -0,0 +1,22 @@ +import logging +import pickle +import unittest + +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class TestLogger(PickleableLogger): + def _get_log_name(self): + return "test" + + +class PickleableLoggerTests(unittest.TestCase): + def test_pickle(self): + logger = TestLogger(log_level=logging.ERROR) + logger = pickle._loads(pickle._dumps(logger)) + self.assertIsInstance(logger._log, logging.Logger) + self.assertEqual(logger._log.level, logging.ERROR) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_projector.py b/sourced/ml/core/tests/test_projector.py new file mode 100644 index 0000000..0c36b60 --- /dev/null +++ b/sourced/ml/core/tests/test_projector.py @@ -0,0 +1,127 @@ +import json +import os +import shutil +import socket +import tempfile +import time +import unittest + +from modelforge import slogging +import requests + +from sourced.ml.core.tests.test_dump import captured_output +from sourced.ml.core.utils.projector import CORSWebServer, present_embeddings, wait, web_server + + +class ProjectorTests(unittest.TestCase): + MAX_ATTEMPTS = 40 + + @classmethod + def setUpClass(cls): + slogging.setup("DEBUG", False) + + def setUp(self): + self.pwd = os.getcwd() + + def tearDown(self): + os.chdir(self.pwd) + + def wait_for_web_server(self): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + result = -1 + attempts = 0 + while result != 0 and attempts < self.MAX_ATTEMPTS: + time.sleep(0.05) + attempts += 1 + result = sock.connect_ex(("0.0.0.0", 8000)) + return attempts, result + + def test_web_server(self): + with tempfile.TemporaryDirectory(prefix="sourced.ml.core-test-") as tmpdir: + os.chdir(tmpdir) + testfile = "test.txt" + with open(testfile, "w") as fout: + fout.write("The Zen of 
Python, by Tim Peters") + server = CORSWebServer() + server.start() + + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertEqual(requests.get("http://0.0.0.0:8000/test.txt").text, + "The Zen of Python, by Tim Peters") + finally: + server.stop() + + def test_wait(self): + web_server.start() + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertTrue(web_server.running) + except: # noqa + web_server.stop() + raise + os.environ["PROJECTOR_SERVER_TIME"] = "0" + wait() + self.assertFalse(web_server.running) + web_server.start() + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertTrue(web_server.running) + finally: + web_server.stop() + + def test_present_embeddings(self): + with tempfile.TemporaryDirectory(prefix="sourced.ml.core-test-") as tmpdir: + tmpdir = os.path.join(tmpdir, "1", "2") + present_embeddings(tmpdir, False, ["one", "two"], + [(str(i), "x") for i in range(5)], + [(i, i) for i in range(5)]) + with open(os.path.join(tmpdir, "id2vec.json")) as fin: + json.load(fin) + with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin: + self.assertEqual(fin.read(), "one\ttwo\n0\tx\n1\tx\n2\tx\n3\tx\n4\tx\n") + with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin: + self.assertEqual(fin.read(), "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n") + + def test_present_embeddings_run_server(self): + def sweded_which(prog): + return None + + which = shutil.which + shutil.which = sweded_which + browser = os.getenv("BROWSER", "") + os.environ["BROWSER"] = "" + + try: + with tempfile.TemporaryDirectory(prefix="sourced.ml.core-test-") as tmpdir: + with captured_output() as (stdout, _, _): + present_embeddings(tmpdir, True, ["one"], + [str(i) for i in range(5)], + [(i, i) for i in range(5)]) + with open(os.path.join(tmpdir, "id2vec.json")) as fin: + json.load(fin) + with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin: + self.assertEqual(fin.read(), "0\n1\n2\n3\n4\n") + with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin: + self.assertEqual(fin.read(), "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n") + self.assertIn( + "\thttp://projector.tensorflow.org/?config=http://0.0.0.0:8000/id2vec.json\n", + stdout.getvalue()) + finally: + shutil.which = which + os.environ["BROWSER"] = browser + web_server.stop() + + def test_stop(self): + web_server.stop() # dummy test to avoid partially covered line in CI + self.assertFalse(web_server.running) + web_server.start() + web_server.stop() + self.assertFalse(web_server.running) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_quant.py b/sourced/ml/core/tests/test_quant.py new file mode 100644 index 0000000..f2decf9 --- /dev/null +++ b/sourced/ml/core/tests/test_quant.py @@ -0,0 +1,49 @@ +from io import BytesIO +import unittest + +import numpy + +from sourced.ml.core.models import QuantizationLevels +import sourced.ml.core.tests.models as paths + + +class QuantizationLevelsTests(unittest.TestCase): + def setUp(self): + self.model = QuantizationLevels().load(source=paths.QUANTLEVELS) + + def test_levels(self): + levels = self.model.levels + self.assertIsInstance(levels, dict) + self.assertEqual(len(levels), 1) + self.assertIsInstance(levels["children"], dict) + self.assertEqual(len(levels["children"]), 259) + + def test_len(self): + self.assertEqual(len(self.model), 1) + + def test_write(self): + levels = {"xxx": 
{"a": numpy.array([1, 2, 3]), "b": numpy.array([4, 5, 6]), + "c": numpy.array([7, 8, 9])}, + "yyy": {"q": numpy.array([3, 2, 1]), "w": numpy.array([6, 5, 4]), + "e": numpy.array([9, 8, 7])}} + buffer = BytesIO() + QuantizationLevels().construct(levels).save(output=buffer, series="quant") + buffer.seek(0) + model = QuantizationLevels().load(buffer) + levels = model.levels + self.assertEqual(len(levels), 2) + self.assertEqual(len(levels["xxx"]), 3) + self.assertEqual(len(levels["yyy"]), 3) + self.assertTrue((levels["xxx"]["a"] == numpy.array([1, 2, 3])).all()) + self.assertTrue((levels["xxx"]["b"] == numpy.array([4, 5, 6])).all()) + self.assertTrue((levels["xxx"]["c"] == numpy.array([7, 8, 9])).all()) + self.assertTrue((levels["yyy"]["q"] == numpy.array([3, 2, 1])).all()) + self.assertTrue((levels["yyy"]["w"] == numpy.array([6, 5, 4])).all()) + self.assertTrue((levels["yyy"]["e"] == numpy.array([9, 8, 7])).all()) + + def test_dump(self): + self.assertEqual(self.model.dump(), "Schemes: [('children', '259@10')]") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_random_walk.py b/sourced/ml/core/tests/test_random_walk.py new file mode 100644 index 0000000..1827589 --- /dev/null +++ b/sourced/ml/core/tests/test_random_walk.py @@ -0,0 +1,29 @@ +import unittest + +import bblfsh + +from sourced.ml.core.algorithms.uast_ids_to_bag import FakeVocabulary +from sourced.ml.core.algorithms.uast_struct_to_bag import Uast2RandomWalks +from sourced.ml.core.tests import models + + +class RandomWalkTests(unittest.TestCase): + def setUp(self): + self.bblfsh = bblfsh.BblfshClient("localhost:9432") + self.uast = self.bblfsh.parse(models.SOURCE_PY).uast + self.uast2walk = Uast2RandomWalks(p_explore_neighborhood=0.5, + q_leave_neighborhood=0.5, + n_walks=5, + n_steps=19, + node2index=FakeVocabulary(), + seed=42) + + def test_rw(self): + for walk in self.uast2walk(self.uast): + for i in range(len(walk)-1): + self.assertNotEqual(walk[i], walk[i+1], + "Two neighbours nodes should not be the same") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_token_parser.py b/sourced/ml/core/tests/test_token_parser.py new file mode 100644 index 0000000..5dab621 --- /dev/null +++ b/sourced/ml/core/tests/test_token_parser.py @@ -0,0 +1,173 @@ +import pickle +import unittest + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser, TokenParser + + +class TokenParserTests(unittest.TestCase): + def setUp(self): + self.tp = TokenParser(stem_threshold=4, max_token_length=20) + self.tp._single_shot = False + + def test_process_token(self): + self.tp.max_token_length = 100 + + tokens = [ + ("UpperCamelCase", ["upper", "camel", "case"]), + ("camelCase", ["camel", "case"]), + ("FRAPScase", ["frap", "case"]), + ("SQLThing", ["sqlt", "hing"]), + ("_Astra", ["astra"]), + ("CAPS_CONST", ["caps", "const"]), + ("_something_SILLY_", ["someth", "silli"]), + ("blink182", ["blink"]), + ("FooBar100500Bingo", ["foo", "bar", "bingo"]), + ("Man45var", ["man", "var"]), + ("method_name", ["method", "name"]), + ("Method_Name", ["method", "name"]), + ("101dalms", ["dalm"]), + ("101_dalms", ["dalm"]), + ("101_DalmsBug", ["dalm", "bug"]), + ("101_Dalms45Bug7", ["dalm", "bug"]), + ("wdSize", ["wd", "size", "wdsize"]), + ("Glint", ["glint"]), + ("foo_BAR", ["foo", "bar"]), + ("sourced.ml.algorithms.uast_ids_to_bag", + ["sourc", "sourcedml", "algorithm", "mlalgorithm", + "uast", "ids", "idsto", "bag", "tobag"]), + ("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", 
"can", "imagin"]), + # Another bad example. Parser failed to parse it correctly + ("SmallIdsToFoOo", ["small", "ids", "idsto", "fo", "oo"]), + ("SmallIdFooo", ["small", "smallid", "fooo", "idfooo"]), + ("ONE_M0re_.__badId.example", ["one", "onem", "re", "bad", "rebad", + "badid", "exampl", "idexampl"]), + ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]), + ("a.b.c.d", ["a", "b", "c", "d"]), + ("A.b.Cd.E", ["a", "b", "cd", "e"]), + ("looong_sh_loooong_sh", ["looong", "looongsh", "loooong", "shloooong", "loooongsh"]), + ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]), + ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]) + ] + + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, correct) + + def test_process_token_single_shot(self): + self.tp.max_token_length = 100 + self.tp._single_shot = True + self.tp.min_split_length = 1 + tokens = [ + ("UpperCamelCase", ["upper", "camel", "case"]), + ("camelCase", ["camel", "case"]), + ("FRAPScase", ["frap", "case"]), + ("SQLThing", ["sqlt", "hing"]), + ("_Astra", ["astra"]), + ("CAPS_CONST", ["caps", "const"]), + ("_something_SILLY_", ["someth", "silli"]), + ("blink182", ["blink"]), + ("FooBar100500Bingo", ["foo", "bar", "bingo"]), + ("Man45var", ["man", "var"]), + ("method_name", ["method", "name"]), + ("Method_Name", ["method", "name"]), + ("101dalms", ["dalm"]), + ("101_dalms", ["dalm"]), + ("101_DalmsBug", ["dalm", "bug"]), + ("101_Dalms45Bug7", ["dalm", "bug"]), + ("wdSize", ["wd", "size"]), + ("Glint", ["glint"]), + ("foo_BAR", ["foo", "bar"]), + ("sourced.ml.algorithms.uast_ids_to_bag", + ["sourc", "ml", "algorithm", "uast", "ids", "to", "bag"]), + ("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", "can", "imagin"]), + # Another bad example. Parser failed to parse it correctly + ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]), + ("SmallIdFooo", ["small", "id", "fooo"]), + ("ONE_M0re_.__badId.example", ["one", "m", "re", "bad", "id", "exampl"]), + ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]), + ("a.b.c.d", ["a", "b", "c", "d"]), + ("A.b.Cd.E", ["a", "b", "cd", "e"]), + ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]), + ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]), + ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]) + ] + + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, correct) + + min_split_length = 3 + self.tp.min_split_length = min_split_length + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, [c for c in correct if len(c) >= min_split_length]) + + def test_split(self): + self.assertEqual(list(self.tp.split("set for")), ["set", "for"]) + self.assertEqual(list(self.tp.split("set /for.")), ["set", "for"]) + self.assertEqual(list(self.tp.split("NeverHav")), ["never", "hav"]) + self.assertEqual(list(self.tp.split("PrintAll")), ["print", "all"]) + self.assertEqual(list(self.tp.split("PrintAllExcept")), ["print", "all", "except"]) + self.assertEqual( + list(self.tp.split("print really long line")), + # 'longli' is expected artifact due to edge effects + ["print", "really", "long", "longli"]) + self.assertEqual( + list(self.tp.split("set /for. 
*&PrintAll")), + ["set", "for", "print", "all"]) + self.assertEqual( + list(self.tp.split("JumpDown not Here")), + ["jump", "down", "not", "here"]) + + self.assertEqual( + list(self.tp.split("a b c d")), + ["a", "b", "c", "d"]) + self.assertEqual( + list(self.tp.split("a b long c d")), + ["a", "b", "long", "blong", "longc", "d"]) + self.assertEqual( + list(self.tp.split("AbCd")), + ["ab", "cd"]) + + def test_split_single_shot(self): + self.tp._single_shot = True + self.tp.min_split_length = 1 + self.assertEqual( + list(self.tp.split("print really long line")), + # 'longli' is expected artifact due to edge effects + ["print", "really", "long", "li"]) + self.assertEqual( + list(self.tp.split("a b c d")), + ["a", "b", "c", "d"]) + self.assertEqual( + list(self.tp.split("a b long c d")), + ["a", "b", "long", "c", "d"]) + self.assertEqual( + list(self.tp.split("AbCd")), + ["ab", "cd"]) + + def test_stem(self): + self.assertEqual(self.tp.stem("lol"), "lol") + self.assertEqual(self.tp.stem("apple"), "appl") + self.assertEqual(self.tp.stem("orange"), "orang") + self.assertEqual(self.tp.stem("embedding"), "embed") + self.assertEqual(self.tp.stem("Alfred"), "Alfred") + self.assertEqual(self.tp.stem("Pluto"), "Pluto") + + def test_pickle(self): + tp = pickle.loads(pickle.dumps(self.tp)) + self.assertEqual(tp.stem("embedding"), "embed") + + +class NoopTokenParserTests(unittest.TestCase): + def setUp(self): + self.tp = NoopTokenParser() + + def test_process_token(self): + self.assertEqual(list(self.tp.process_token("abcdef")), ["abcdef"]) + self.assertEqual(list(self.tp.process_token("abcd_ef")), ["abcd_ef"]) + self.assertEqual(list(self.tp.process_token("abcDef")), ["abcDef"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py b/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py new file mode 100644 index 0000000..b360683 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py @@ -0,0 +1,20 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2GraphletBag +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2GraphletBagTest(unittest.TestCase): + def setUp(self): + self.graphlets_bag_extractor = Uast2GraphletBag() + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.graphlets_bag_extractor(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_struct_to_bag.py b/sourced/ml/core/tests/test_uast_struct_to_bag.py new file mode 100644 index 0000000..d6590d4 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_struct_to_bag.py @@ -0,0 +1,55 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import UastRandomWalk2Bag, UastSeq2Bag +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2RandomWalk2BagTest(unittest.TestCase): + def setUp(self): + self.uast_random_walk2bag = UastRandomWalk2Bag(seq_len=[2, 3]) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.uast_random_walk2bag(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + def test_equivalence_prepare_starting_nodes(self): + starting_nodes_old = self.prepare_starting_nodes(self.uast) + starting_nodes = self.uast_random_walk2bag.uast2walks.prepare_starting_nodes(self.uast) + 
self.assertEqual(len(starting_nodes_old), len(starting_nodes)) + + def structure(tree): + from collections import Counter + return set(Counter(len(node.children) for node in tree)) + + self.assertEqual(structure(starting_nodes_old), structure(starting_nodes)) + + def prepare_starting_nodes(self, uast): + starting_nodes = [] + self._prepare_starting_nodes(uast, None, starting_nodes) + + return starting_nodes + + def _prepare_starting_nodes(self, root, parent, starting_nodes): + node = self.uast_random_walk2bag.uast2walks._extract_node(node=root, parent=parent) + starting_nodes.append(node) + + for ch in root.children: + node.children.append(self._prepare_starting_nodes( + ch, parent=node, starting_nodes=starting_nodes)) + + +class UastSeq2BagTest(unittest.TestCase): + def setUp(self): + self.uast_seq2bag = UastSeq2Bag(seq_len=[2, 3]) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.uast_seq2bag(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_id_distance.py b/sourced/ml/core/tests/test_uast_to_id_distance.py new file mode 100644 index 0000000..d3f5d56 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_id_distance.py @@ -0,0 +1,138 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2IdLineDistance, Uast2IdTreeDistance +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2IdTreeDistanceTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2IdTreeDistance(token_parser=NoopTokenParser(), + max_distance=4) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + self.maxDiff = None + + def test_result(self): + correct = [(("__spec__", "ModuleSpec"), 2), + (("__spec__", "ModuleSpec"), 3), + (("__spec__", "ModuleSpec"), 3), + (("collections", "ModuleSpec"), 2), + (("collections", "ModuleSpec"), 2), + (("collections", "ModuleSpec"), 3), + (("collections", "__spec__"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "collections"), 3), + (("namedtuple", "collections"), 3), + (("setup", "modelforge.logs"), 3), + (("setup_logging", "modelforge.logs"), 3), + (("sys", "modelforge.logs"), 3), + (("sys", "modules"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 3), + (("utmain", "ModuleSpec"), 3), + (("utmain", "__package__"), 2), + (("utmain", "__spec__"), 2), + (("utmain", "__spec__"), 2), + (("utmain", "collections"), 3), + (("utmain", "modelforge.logs"), 2), + (("utmain", "modelforge.logs"), 2), + (("utmain", "setup"), 3), + (("utmain", "setup"), 3), + (("utmain", "setup_logging"), 3), + (("utmain", "setup_logging"), 3), + (("utmain", "sys"), 3), + (("utmain", "sys"), 3)] + + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +class Uast2IdLineDistanceTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2IdLineDistance(token_parser=NoopTokenParser(), + max_distance=3) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + self.maxDiff = None + + def test_result(self): + correct = [(("__package__", "ModuleSpec"), 2), + (("__spec__", "ModuleSpec"), 0), + (("__spec__", "ModuleSpec"), 1), + (("__spec__", "ModuleSpec"), 1), + (("__spec__", 
"ModuleSpec"), 2), + (("__spec__", "__package__"), 0), + (("collections", "ModuleSpec"), 1), + (("collections", "ModuleSpec"), 2), + (("collections", "__package__"), 1), + (("collections", "__spec__"), 1), + (("collections", "__spec__"), 2), + (("modules", "__package__"), 1), + (("modules", "__spec__"), 1), + (("modules", "collections"), 2), + (("namedtuple", "ModuleSpec"), 0), + (("namedtuple", "ModuleSpec"), 1), + (("namedtuple", "ModuleSpec"), 1), + (("namedtuple", "ModuleSpec"), 2), + (("namedtuple", "ModuleSpec"), 2), + (("namedtuple", "__package__"), 1), + (("namedtuple", "__package__"), 2), + (("namedtuple", "__spec__"), 1), + (("namedtuple", "__spec__"), 1), + (("namedtuple", "__spec__"), 2), + (("namedtuple", "__spec__"), 2), + (("namedtuple", "collections"), 0), + (("namedtuple", "collections"), 1), + (("namedtuple", "modules"), 2), + (("setup_logging", "modelforge.logs"), 0), + (("setup_logging", "setup"), 1), + (("sys", "__package__"), 1), + (("sys", "__spec__"), 1), + (("sys", "collections"), 2), + (("sys", "modelforge.logs"), 2), + (("sys", "modules"), 0), + (("sys", "namedtuple"), 2), + (("sys", "setup_logging"), 2), + (("utmain", "ModuleSpec"), 0), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "__package__"), 0), + (("utmain", "__package__"), 0), + (("utmain", "__package__"), 1), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 1), + (("utmain", "__spec__"), 2), + (("utmain", "collections"), 1), + (("utmain", "collections"), 1), + (("utmain", "collections"), 2), + (("utmain", "collections"), 2), + (("utmain", "modules"), 0), + (("utmain", "modules"), 1), + (("utmain", "modules"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 2), + (("utmain", "namedtuple"), 2), + (("utmain", "namedtuple"), 2), + (("utmain", "namedtuple"), 2), + (("utmain", "sys"), 0), + (("utmain", "sys"), 1), + (("utmain", "sys"), 1)] + + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_id_sequence.py b/sourced/ml/core/tests/test_uast_to_id_sequence.py new file mode 100644 index 0000000..ffce913 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_id_sequence.py @@ -0,0 +1,25 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2IdSequence +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2IdSequenceTest(unittest.TestCase): + def setUp(self): + self.uast2id_sequence = Uast2IdSequence(token_parser=NoopTokenParser()) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_result(self): + correct = ["sys", "setup_logging", "modelforge.logs", "utmain", "modules", "sys", + "__package__", "utmain", "__spec__", "utmain", "namedtuple", "collections", + "ModuleSpec", "namedtuple", "__spec__", "utmain", "ModuleSpec", "ModuleSpec", + "utmain", "setup", "setup_logging"] + res = self.uast2id_sequence(self.uast) + self.assertEqual(res, self.uast2id_sequence.concat(correct)) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_role_id_pairs.py 
b/sourced/ml/core/tests/test_uast_to_role_id_pairs.py new file mode 100644 index 0000000..6ec6b08 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_role_id_pairs.py @@ -0,0 +1,42 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2RoleIdPairs +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2NodesBagTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2RoleIdPairs(token_parser=NoopTokenParser()) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_result(self): + correct = [("ModuleSpec", "BODY | IF | THEN"), + ("ModuleSpec", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("ModuleSpec", "STATEMENT | INCOMPLETE"), + ("__package__", "BINARY | EXPRESSION | CONDITION"), + ("__spec__", "BINARY | EXPRESSION | CONDITION"), + ("__spec__", "BODY | IF | THEN"), + ("collections", "IDENTIFIER | IMPORT | PATHNAME"), + ("modelforge.logs", "IDENTIFIER | IMPORT | PATHNAME"), + ("modules", "RIGHT | EXPRESSION | INCOMPLETE"), + ("namedtuple", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("namedtuple", "IDENTIFIER | IMPORT | PATHNAME"), + ("setup", "IDENTIFIER | DECLARATION | FUNCTION | NAME"), + ("setup_logging", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("setup_logging", "IDENTIFIER | IMPORT | PATHNAME"), + ("sys", "IDENTIFIER | IMPORT | PATHNAME"), + ("sys", "RIGHT | EXPRESSION | INCOMPLETE"), + ("utmain", "BINARY | EXPRESSION | CONDITION"), + ("utmain", "BINARY | EXPRESSION | CONDITION"), + ("utmain", "BODY | IF | THEN"), + ("utmain", "FILE | MODULE"), + ("utmain", "STATEMENT | INCOMPLETE")] + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/utils/__init__.py b/sourced/ml/core/utils/__init__.py new file mode 100644 index 0000000..41754f0 --- /dev/null +++ b/sourced/ml/core/utils/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from sourced.ml.core.utils.bigartm import install_bigartm +from sourced.ml.core.utils.pickleable_logger import PickleableLogger diff --git a/sourced/ml/core/utils/bblfsh.py b/sourced/ml/core/utils/bblfsh.py new file mode 100644 index 0000000..12b111a --- /dev/null +++ b/sourced/ml/core/utils/bblfsh.py @@ -0,0 +1,19 @@ +from distutils.version import StrictVersion + +from bblfsh.client import BblfshClient + +BBLFSH_VERSION_LOW = "2.2" +BBLFSH_VERSION_HIGH = "3.0" + + +def check_version(host: str = "0.0.0.0", port: str = "9432") -> bool: + """ + Check if the bblfsh server version matches module requirements. 
+ + :param host: bblfsh server host + :param port: bblfsh server port + :return: True if bblfsh version specified matches requirements + """ + # get version and remove leading 'v' + version = StrictVersion(BblfshClient("%s:%s" % (host, port)).version().version.lstrip("v")) + return StrictVersion(BBLFSH_VERSION_LOW) <= version < StrictVersion(BBLFSH_VERSION_HIGH) diff --git a/sourced/ml/core/utils/bblfsh_roles.py b/sourced/ml/core/utils/bblfsh_roles.py new file mode 100644 index 0000000..22f8569 --- /dev/null +++ b/sourced/ml/core/utils/bblfsh_roles.py @@ -0,0 +1,14 @@ +import bblfsh + + +IDENTIFIER = bblfsh.role_id("IDENTIFIER") +QUALIFIED = bblfsh.role_id("QUALIFIED") +LITERAL = bblfsh.role_id("LITERAL") +OPERATOR = bblfsh.role_id("OPERATOR") +EXPRESSION = bblfsh.role_id("EXPRESSION") +LEFT = bblfsh.role_id("LEFT") +BINARY = bblfsh.role_id("BINARY") +ASSIGNMENT = bblfsh.role_id("ASSIGNMENT") +FUNCTION = bblfsh.role_id("FUNCTION") +DECLARATION = bblfsh.role_id("DECLARATION") +NAME = bblfsh.role_id("NAME") diff --git a/sourced/ml/core/utils/bigartm.py b/sourced/ml/core/utils/bigartm.py new file mode 100644 index 0000000..26e78e7 --- /dev/null +++ b/sourced/ml/core/utils/bigartm.py @@ -0,0 +1,58 @@ +import glob +import logging +import multiprocessing +import os +import shutil +import subprocess +import tempfile + + +def execute(cmd, cwd, log): + log.info(">>> %s", cmd) + parsed = [v for v in cmd.split(" ") if v] + subprocess.check_call(parsed, cwd=cwd) + + +def install_bigartm(args=None, target="./bigartm", tempdir=None): + """ + Deploys bigartm/bigartm at the specified path. + + :param args: :class:`argparse.Namespace` with "output" and "tmpdir". \ + "output" sets the target directory, "tmpdir" sets \ + the temporary directory which is used to clone bigartm/bigartm \ + and build it. + :param target: The path to the built executable. If args is not None, it \ + becomes overridden. + :param tempdir: The temporary directory where to clone and build \ + bigartm/bigartm. If args is not None, it becomes overridden. + :return: None if successful; otherwise, the error code (can be 0!). 
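+ + Example (both paths below are illustrative, not defaults):: + + install_bigartm(target="./bin/bigartm", tempdir="/tmp")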
+ """ + log = logging.getLogger("bigartm") + if args is not None: + tempdir = args.tmpdir + target = os.path.join(args.output, "bigartm") + if shutil.which(os.path.basename(target)) or shutil.which(target, path=os.getcwd()): + log.warning("bigartm is in the PATH, no-op.") + return 0 + if not shutil.which("cmake"): + log.error("You need to install cmake.") + return 1 + parent_dir = os.path.dirname(target) + os.makedirs(parent_dir, exist_ok=True) + if not os.path.isdir(parent_dir): + log.error("%s is not a directory.", parent_dir) + return 2 + with tempfile.TemporaryDirectory(prefix="bigartm-", dir=tempdir) as tmpdir: + log.info("Building bigartm/bigartm in %s...", tmpdir) + execute("git clone --single-branch --depth=1 https://github.com/bigartm/bigartm .", + tmpdir, log) + cwd = os.path.join(tmpdir, "build") + os.mkdir(cwd) + execute("cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DPYTHON=python3 ..", + cwd, log) + execute("make -j%d" % multiprocessing.cpu_count(), cwd, log) + whl_path = glob.glob(os.path.join(tmpdir, "build/python/*.whl"))[0] + execute("pip3 install \"%s\"" % whl_path, cwd, log) + shutil.copyfile(os.path.join(cwd, "bin", "bigartm"), target) + os.chmod(target, 0o777) + log.info("Installed %s", os.path.abspath(target)) diff --git a/sourced/ml/core/utils/pickleable_logger.py b/sourced/ml/core/utils/pickleable_logger.py new file mode 100644 index 0000000..31d5bd0 --- /dev/null +++ b/sourced/ml/core/utils/pickleable_logger.py @@ -0,0 +1,35 @@ +import logging + + +class PickleableLogger: + """ + Base class which provides the logging features through ``self._log``. + + Can be safely pickled. + """ + + def __init__(self, log_level=logging.INFO): + """ + Class constructor + + :param log_level: logging level. + """ + self._log = logging.getLogger(self._get_log_name()) + self._log.setLevel(log_level) + + def __getstate__(self): + state = self.__dict__.copy() + state["_log"] = self._log.level + return state + + def __setstate__(self, state): + self.__dict__.update(state) + log_level = state["_log"] + self._log = logging.getLogger(self._get_log_name()) + self._log.setLevel(log_level) + + def _get_log_name(self): + """ + Children must implement this method. It shall return the logger's name. 
+ """ + raise NotImplementedError diff --git a/sourced/ml/core/utils/projector.py b/sourced/ml/core/utils/projector.py new file mode 100644 index 0000000..77d747b --- /dev/null +++ b/sourced/ml/core/utils/projector.py @@ -0,0 +1,108 @@ +from http.server import HTTPServer, SimpleHTTPRequestHandler, test +import logging +import os +import shutil +import threading +import time + + +class CORSWebServer: + def __init__(self): + self.thread = None + self.server = None + + def serve(self): + outer = self + + class ClojureServer(HTTPServer): + def __init__(self, *args, **kwargs): + HTTPServer.__init__(self, *args, **kwargs) + outer.server = self + + class CORSRequestHandler(SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + SimpleHTTPRequestHandler.end_headers(self) + + test(CORSRequestHandler, ClojureServer) + + def start(self): + self.thread = threading.Thread(target=self.serve) + self.thread.start() + + def stop(self): + if self.running: + self.server.shutdown() + self.server.server_close() + self.thread.join() + self.server = None + self.thread = None + + @property + def running(self): + return self.server is not None + + +web_server = CORSWebServer() + + +def present_embeddings(destdir, run_server, labels, index, embeddings): + log = logging.getLogger("projector") + log.info("Writing Tensorflow Projector files...") + if not os.path.isdir(destdir): + os.makedirs(destdir) + os.chdir(destdir) + metaf = "id2vec_meta.tsv" + with open(metaf, "w") as fout: + if len(labels) > 1: + fout.write("\t".join(labels) + "\n") + for item in index: + if len(labels) > 1: + fout.write("\t".join(item) + "\n") + else: + fout.write(item + "\n") + log.info("Wrote %s", metaf) + dataf = "id2vec_data.tsv" + with open(dataf, "w") as fout: + for vec in embeddings: + fout.write("\t".join(str(v) for v in vec)) + fout.write("\n") + log.info("Wrote %s", dataf) + jsonf = "id2vec.json" + with open(jsonf, "w") as fout: + fout.write("""{ + "embeddings": [ + { + "tensorName": "id2vec", + "tensorShape": [%s, %s], + "tensorPath": "http://0.0.0.0:8000/%s", + "metadataPath": "http://0.0.0.0:8000/%s" + } + ] +} +""" % (len(embeddings), len(embeddings[0]), dataf, metaf)) + log.info("Wrote %s", jsonf) + if run_server and not web_server.running: + web_server.start() + url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf + log.info(url) + if run_server: + if shutil.which("xdg-open") is not None: + os.system("xdg-open " + url) + else: + browser = os.getenv("BROWSER", "") + if browser: + os.system(browser + " " + url) + else: + print("\t" + url) + + +def wait(): + log = logging.getLogger("projector") + secs = int(os.getenv("PROJECTOR_SERVER_TIME", "60")) + log.info("Sleeping for %d seconds, safe to Ctrl-C" % secs) + try: + time.sleep(secs) + except KeyboardInterrupt: + pass + web_server.stop()