diff --git a/sourced/ml/core/algorithms/__init__.py b/sourced/ml/core/algorithms/__init__.py new file mode 100644 index 0000000..f457888 --- /dev/null +++ b/sourced/ml/core/algorithms/__init__.py @@ -0,0 +1,9 @@ +# flake8: noqa +from sourced.ml.core.algorithms.tf_idf import log_tf_log_idf +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag, uast2sequence +from sourced.ml.core.algorithms.uast_struct_to_bag import UastRandomWalk2Bag, UastSeq2Bag +from sourced.ml.core.algorithms.uast_inttypes_to_nodes import Uast2QuantizedChildren +from sourced.ml.core.algorithms.uast_inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.algorithms.uast_to_role_id_pairs import Uast2RoleIdPairs +from sourced.ml.core.algorithms.uast_id_distance import Uast2IdLineDistance, Uast2IdTreeDistance +from sourced.ml.core.algorithms.uast_to_id_sequence import Uast2IdSequence diff --git a/sourced/ml/core/algorithms/id_embedding.py b/sourced/ml/core/algorithms/id_embedding.py new file mode 100644 index 0000000..2a8ac40 --- /dev/null +++ b/sourced/ml/core/algorithms/id_embedding.py @@ -0,0 +1,42 @@ +import numpy + + +def extract_coocc_matrix(global_shape, word_indices, model): + # Stage 1 - extract the tokens, map them to the global vocabulary + indices = [] + mapped_indices = [] + for i, w in enumerate(model.tokens): + gi = word_indices.get(w) + if gi is not None: + indices.append(i) + mapped_indices.append(gi) + indices = numpy.array(indices) + mapped_indices = numpy.array(mapped_indices) + # Stage 2 - sort the matched tokens by the index in the vocabulary + order = numpy.argsort(mapped_indices) + indices = indices[order] + mapped_indices = mapped_indices[order] + # Stage 3 - produce the csr_matrix with the matched tokens **only** + matrix = model.matrix.tocsr()[indices][:, indices] + # Stage 4 - convert this matrix to the global (ccmatrix) coordinates + csr_indices = matrix.indices + for i, v in enumerate(csr_indices): + # Here we use the fact that indices and mapped_indices are in the same order + csr_indices[i] = mapped_indices[v] + csr_indptr = matrix.indptr + new_indptr = [0] + for i, v in enumerate(mapped_indices): + prev_ptr = csr_indptr[i] + ptr = csr_indptr[i + 1] + + # Handle missing rows + prev = (mapped_indices[i - 1] + 1) if i > 0 else 0 + for _ in range(prev, v): + new_indptr.append(prev_ptr) + + new_indptr.append(ptr) + for _ in range(mapped_indices[-1] + 1, global_shape[0]): + new_indptr.append(csr_indptr[-1]) + matrix.indptr = numpy.array(new_indptr) + matrix._shape = global_shape + return matrix diff --git a/sourced/ml/core/algorithms/id_splitter/README.md b/sourced/ml/core/algorithms/id_splitter/README.md new file mode 100644 index 0000000..695225b --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/README.md @@ -0,0 +1,128 @@ +# Neural Identifier Splitter +Article [Splitting source code identifiers using Bidirectional LSTM Recurrent Neural Network](https://arxiv.org/abs/1805.11651). + +### Agenda +* Data +* Training pipeline +* How to launch + +### Data +You can download the dataset [here](https://drive.google.com/open?id=1wZR5zF1GL1fVcA1gZuAN_9rSLd5ssqKV). More information about the dataset is available [here](https://github.com/src-d/datasets/tree/master/Identifiers). +#### Data format +* format of file: `.csv.gz`. 
+* the `csv` structure: + +|num_files|num_occ|num_repos|token|token_split| +|:--|:--|:--|:--|:--| +|1|2|1|quesesSet|queses set| +|...|...|...|...|...| + +#### Data stats +* 49 millions of identifiers +* 1 GB + +### Training pipeline +Training pipeline consists of several steps +* [prepare features](https://github.com/src-d/ml/blob/master/sourced/ml/algorithms/id_splitter/features.py#L44-#L118) - read data, extract features, train/test split +* [prepare generators for keras](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L34-#L48) +* [prepare model - RNN or CNN](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L53-#L76) +* [training](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L78-#L89) +* [quality report and save the model](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L91-#L96) + +### How to launch +First of all you need to download data using link above. + +Usage: +```console +usage: srcml train-id-split [-h] -i INPUT [-e EPOCHS] [-b BATCH_SIZE] + [-l LENGTH] -o OUTPUT [-t TEST_RATIO] + [-p {pre,post}] [--optimizer {RMSprop,Adam}] + [--lr LR] [--final-lr FINAL_LR] + [--samples-before-report SAMPLES_BEFORE_REPORT] + [--val-batch-size VAL_BATCH_SIZE] [--seed SEED] + [--devices DEVICES] + [--csv-identifier CSV_IDENTIFIER] + [--csv-identifier-split CSV_IDENTIFIER_SPLIT] + [--include-csv-header] --model {RNN,CNN} + [-s STACK] + [--type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU}] + [-n NEURONS] [-f FILTERS] [-k KERNEL_SIZES] + [--dim-reduction DIM_REDUCTION] + +optional arguments: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Path to the input data in CSV + format:num_files,num_occ,num_repos,token,token_split + -e EPOCHS, --epochs EPOCHS + Number of training epochs. The more the betterbut the + training time is proportional. (default: 10) + -b BATCH_SIZE, --batch-size BATCH_SIZE + Batch size. Higher values better utilize GPUsbut may + harm the convergence. (default: 500) + -l LENGTH, --length LENGTH + RNN sequence length. (default: 40) + -o OUTPUT, --output OUTPUT + Path to store the trained model. + -t TEST_RATIO, --test-ratio TEST_RATIO + Fraction of the dataset to use for evaluation. + (default: 0.2) + -p {pre,post}, --padding {pre,post} + Whether to pad before or after each sequence. + (default: post) + --optimizer {RMSprop,Adam} + Algorithm to use as an optimizer for the neural net. + (default: Adam) + --lr LR Initial learning rate. (default: 0.001) + --final-lr FINAL_LR Final learning rate. The decrease from the initial + learning rate is done linearly. (default: 1e-05) + --samples-before-report SAMPLES_BEFORE_REPORT + Number of samples between each validation reportand + training updates. (default: 5000000) + --val-batch-size VAL_BATCH_SIZE + Batch size for validation.It can be increased to speed + up the pipeline butit proportionally increases the + memory consumption. (default: 2000) + --seed SEED Random seed. (default: 1989) + --devices DEVICES Device(s) to use. '-1' means CPU. (default: 0) + --csv-identifier CSV_IDENTIFIER + Column name in the CSV file for the raw identifier. + (default: 3) + --csv-identifier-split CSV_IDENTIFIER_SPLIT + Column name in the CSV file for the splitidentifier. + (default: 4) + --include-csv-header Treat the first line of the input CSV as a + regularline. (default: False) + --model {RNN,CNN} Neural Network model to use to learn the + identifiersplitting task. + -s STACK, --stack STACK + Number of layers stacked on each other. 
(default: 2) + --type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU} + Recurrent layer type to use. (default: LSTM) + -n NEURONS, --neurons NEURONS + Number of neurons on each layer. (default: 256) + -f FILTERS, --filters FILTERS + Number of filters for each kernel size. (default: + 64,32,16,8) + -k KERNEL_SIZES, --kernel-sizes KERNEL_SIZES + Sizes for sliding windows. (default: 2,4,8,16) + --dim-reduction DIM_REDUCTION + Number of 1-d kernels to reduce dimensionalityafter + each layer. (default: 32) +``` + + +Examples of commands: +1) Train RNN with LSTM cells +```console +srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output +``` +2) Train RNN with CuDNNLSTM cells +```console +srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output \ +--type-cell CuDNNLSTM +``` +3) Train CNN +```console +srcml train-id-split --model CNN --input /path/to/input.csv.gz --output /path/to/output +``` diff --git a/sourced/ml/core/algorithms/id_splitter/__init__.py b/sourced/ml/core/algorithms/id_splitter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/algorithms/id_splitter/features.py b/sourced/ml/core/algorithms/id_splitter/features.py new file mode 100644 index 0000000..0333dd8 --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/features.py @@ -0,0 +1,118 @@ +import logging +import string +import tarfile +from typing import List, Tuple + +from modelforge.progress_bar import progress_bar +import numpy + + +def read_identifiers(csv_path: str, use_header: bool, max_identifier_len: int, identifier_col: int, + split_identifier_col: int, shuffle: bool = True) -> List[str]: + """ + Reads and filters too long identifiers in the CSV file. + + :param csv_path: path to the CSV file. + :param use_header: uses header as normal line (True) or treat as header line with column names. + :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer. + :param identifier_col: column name in the CSV file for the raw identifier. + :param split_identifier_col: column name in the CSV file for the split identifier lowercase. + :param shuffle: indicates whether to reorder the list of identifiers + at random after reading it. + :return: list of split identifiers. + """ + log = logging.getLogger("read_identifiers") + log.info("Reading data from the CSV file %s", csv_path) + identifiers = [] + # TODO: Update dataset loading as soon as https://github.com/src-d/backlog/issues/1212 done + # Think about dataset download step + with tarfile.open(csv_path, encoding="utf-8") as f: + assert len(f.members) == 1, "One archived file is expected, got: %s" % len(f.members) + content = f.extractfile(f.members[0]) + if not use_header: + content.readline() + for line in progress_bar(content.readlines(), log): + row = line.decode("utf-8").strip().split(",") + if len(row[identifier_col]) <= max_identifier_len: + identifiers.append(row[split_identifier_col]) + if shuffle: + numpy.random.shuffle(identifiers) + log.info("Number of identifiers after filtering: %s." % len(identifiers)) + return identifiers + + +def prepare_features(csv_path: str, use_header: bool, max_identifier_len: int, + identifier_col: int, split_identifier_col: int, test_ratio: float, + padding: str, shuffle: bool = True) -> Tuple[numpy.array]: + """ + Prepare the features to train the identifier splitting task. + + :param csv_path: path to the CSV file. + :param use_header: uses header as normal line (True) or treat as header line with column names. 
+ :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer. + :param identifier_col: column in the CSV file for the raw identifier. + :param split_identifier_col: column in the CSV file for the split identifier. + :param shuffle: indicates whether to reorder the list of identifiers + at random after reading it. + :param test_ratio: Proportion of test samples used for evaluation. + :param padding: position where to add padding values: + after the intput sequence if "post", before if "pre". + :return: training and testing features to train the neural net for the splitting task. + """ + from keras.preprocessing.sequence import pad_sequences + log = logging.getLogger("prepare_features") + + # read data from the input file + identifiers = read_identifiers(csv_path=csv_path, use_header=use_header, + max_identifier_len=max_identifier_len, + identifier_col=identifier_col, + split_identifier_col=split_identifier_col, shuffle=shuffle) + + log.info("Converting identifiers to character indices") + log.info("Number of identifiers: %d, Average length: %d characters" % + (len(identifiers), numpy.mean([len(i) for i in identifiers]))) + + char2ind = {c: i + 1 for i, c in enumerate(sorted(string.ascii_lowercase))} + + char_id_seq = [] + splits = [] + for identifier in identifiers: + # iterate through the identifier and convert to array of char indices & boolean split array + index_arr = [] + split_arr = [] + skip_char = False + for char in identifier.strip(): + if char in char2ind: + index_arr.append(char2ind[char]) + if skip_char: + skip_char = False + continue + split_arr.append(0) + elif char == " ": + split_arr.append(1) + skip_char = True + else: + log.warning("Unexpected symbol %s in identifier", char) + assert len(index_arr) == len(split_arr) + char_id_seq.append(index_arr) + splits.append(split_arr) + + log.info("Number of subtokens: %d, Number of distinct characters: %d" % + (sum(sum(split_arr) for split_arr in splits) + len(identifiers), + len({i for index_arr in char_id_seq for i in index_arr}))) + + log.info("Train/test splitting...") + n_train = int((1 - test_ratio) * len(char_id_seq)) + X_train = char_id_seq[:n_train] + X_test = char_id_seq[n_train:] + y_train = splits[:n_train] + y_test = splits[n_train:] + log.info("Number of train samples: %s, number of test samples: %s" % (len(X_train), + len(X_test))) + log.info("Padding the sequences...") + X_train = pad_sequences(X_train, maxlen=max_identifier_len, padding=padding) + X_test = pad_sequences(X_test, maxlen=max_identifier_len, padding=padding) + y_train = pad_sequences(y_train, maxlen=max_identifier_len, padding=padding) + y_test = pad_sequences(y_test, maxlen=max_identifier_len, padding=padding) + + return X_train, X_test, y_train[:, :, None], y_test[:, :, None] diff --git a/sourced/ml/core/algorithms/id_splitter/nn_model.py b/sourced/ml/core/algorithms/id_splitter/nn_model.py new file mode 100644 index 0000000..e917493 --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/nn_model.py @@ -0,0 +1,243 @@ +import string +from typing import Callable, List, Tuple, Union +import warnings + +import keras +from keras import backend as kbackend +from keras.layers import ( + BatchNormalization, Concatenate, Conv1D, Dense, Embedding, Input, TimeDistributed) +from keras.models import Model +import numpy +try: + import tensorflow as tf +except ImportError: + warnings.warn("Tensorflow is not installed, dependent functionality is unavailable.") + + +LOSS = "binary_crossentropy" +METRICS = ["accuracy"] +# Number 
of unique characters and dimension of the embedding layer +NUM_CHARS = len(string.ascii_lowercase) + + +def register_metric(metric: Union[str, Callable]) -> Union[str, Callable]: + """ + Decorator function to register the metrics in the METRICS constant. + + :param metric: name of the tensorflow metric or custom function metric. + :return: the metric. + """ + assert isinstance(metric, str) or callable(metric) + METRICS.append(metric) + return metric + + +def prepare_devices(devices: str) -> Tuple[str]: + """ + Extract devices from arguments. + + :param devices: devices to use passed as one string argument. + :return: split devices. + """ + devices = devices.split(",") + if len(devices) == 2: + dev0, dev1 = ("/gpu:" + dev for dev in devices) + elif len(devices) == 1: + if int(devices[0]) != -1: + dev0 = dev1 = "/gpu:" + devices[0] + else: + dev0 = dev1 = "/cpu:0" + else: + raise ValueError("Expected 1 or 2 devices but got %d from the devices argument %s" % + (len(devices), devices)) + return dev0, dev1 + + +def prepare_input_emb(maxlen: int) -> Tuple[tf.Tensor]: + """ + Builds character embeddings, a dense representation of characters to feed the RNN with. + + :param maxlen: maximum length of the input sequence. + :return: input and one-hot character embedding layer. + """ + char_seq = Input((maxlen,)) + emb = Embedding(input_dim=NUM_CHARS + 1, output_dim=NUM_CHARS + 1, input_length=maxlen, + mask_zero=False, weights=[numpy.eye(NUM_CHARS + 1)], trainable=False)(char_seq) + return char_seq, emb + + +def add_output_layer(hidden_layer: tf.Tensor) -> keras.layers.wrappers.TimeDistributed: + """ + Applies a Dense layer to each of the timestamps of a hidden layer, independently. + The output layer has 1 sigmoid per character which predicts if there is a space or not + before the character. + + :param hidden_layer: hidden layer before the output layer. + :return: output layer. + """ + norm_input = BatchNormalization()(hidden_layer) + return TimeDistributed(Dense(1, activation="sigmoid"))(norm_input) + + +def add_rnn(X: tf.Tensor, units: int, rnn_layer: str, dev0: str = "/gpu:0", + dev1: str = "/gpu:1") -> tf.Tensor: + """ + Adds a bidirectional RNN layer with the specified parameters. + + :param X: input layer. + :param units: number of neurons in the output layer. + :param rnn_layer: type of cell in the RNN. + :param dev0: device that will be used as forward pass of RNN and concatenation. + :param dev1: device that will be used as backward pass. + :return: output bidirectional RNN layer. + """ + # select the type of RNN layer + rnn_layer = getattr(keras.layers, rnn_layer) + + # add the forward & backward RNN + with tf.device(dev0): + forward = rnn_layer(units=units, return_sequences=True)(X) + with tf.device(dev1): + backward = rnn_layer(units=units, return_sequences=True, go_backwards=True)(X) + + # concatenate + with tf.device(dev1): + bidi = Concatenate(axis=-1)([forward, backward]) + return bidi + + +def build_rnn(maxlen: int, units: int, stack: int, optimizer: str, dev0: str, + dev1: str, rnn_layer: str) -> keras.engine.training.Model: + """ + Builds a RNN model with the parameters specified as arguments. + + :param maxlen: maximum length of the input sequence. + :param units: number of neurons or dimensionality of the output RNN. + :param stack: number of RNN layers to stack. + :param optimizer: algorithm to use as an optimizer for the RNN. + :param rnn_layer: recurrent layer type to use. + :param dev0: first device to use when running specific operations. 
+ :param dev1: second device to use when running specific operations. + :return: compiled RNN model. + """ + # prepare the model + with tf.device(dev0): + char_seq, hidden_layer = prepare_input_emb(maxlen) + + # stack the BiDi-RNN layers + for _ in range(stack): + hidden_layer = add_rnn(hidden_layer, units=units, rnn_layer=rnn_layer, + dev0=dev0, dev1=dev1) + output = add_output_layer(hidden_layer) + + # compile the model + model = Model(inputs=char_seq, outputs=output) + model.compile(optimizer=optimizer, loss=LOSS, metrics=METRICS) + return model + + +def add_conv(X: tf.Tensor, filters: List[int], kernel_sizes: List[int], + output_n_filters: int) -> tf.Tensor: + """ + Builds a single convolutional layer. + + :param X: input layer. + :param filters: number of output filters in the convolution. + :param kernel_sizes: list of lengths of the 1D convolution window. + :param output_n_filters: number of 1D output filters. + :return: output layer. + """ + # normalize the input + X = BatchNormalization()(X) + + # add convolutions + convs = [] + + for n_filters, kernel_size in zip(filters, kernel_sizes): + conv = Conv1D(filters=n_filters, kernel_size=kernel_size, padding="same", + activation="relu") + convs.append(conv(X)) + + # concatenate all convolutions + conc = Concatenate(axis=-1)(convs) + conc = BatchNormalization()(conc) + + # dimensionality reduction + conv = Conv1D(filters=output_n_filters, kernel_size=1, padding="same", activation="relu") + return conv(conc) + + +def build_cnn(maxlen: int, filters: List[int], output_n_filters: int, stack: int, + kernel_sizes: List[int], optimizer: str, device: str) -> keras.engine.training.Model: + """ + Builds a CNN model with the parameters specified as arguments. + + :param maxlen: maximum length of the input sequence. + :param filters: number of output filters in the convolution. + :param output_n_filters: number of 1d output filters. + :param stack: number of CNN layers to stack. + :param kernel_sizes: list of lengths of the 1D convolution window. + :param optimizer: algorithm to use as an optimizer for the CNN. + :param device: device to use when running specific operations. + :return: compiled CNN model. + """ + # prepare the model + with tf.device(device): + char_seq, hidden_layer = prepare_input_emb(maxlen) + + # stack the CNN layers + for _ in range(stack): + hidden_layer = add_conv(hidden_layer, filters=filters, kernel_sizes=kernel_sizes, + output_n_filters=output_n_filters) + output = add_output_layer(hidden_layer) + + # compile the model + model = Model(inputs=char_seq, outputs=output) + model.compile(optimizer=optimizer, loss=LOSS, metrics=METRICS) + return model + + +@register_metric +def precision(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the precision, a metric for multi-label classification of + how many selected items are relevant. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of precision. + """ + true_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true * y_pred, 0, 1))) + predicted_positives = kbackend.sum(kbackend.round(kbackend.clip(y_pred, 0, 1))) + precision = true_positives / (predicted_positives + kbackend.epsilon()) + return precision + + +@register_metric +def recall(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the recall, a metric for multi-label classification of + how many relevant items are selected. + + :param y_true: tensor of true labels. 
+ :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of recall. + """ + true_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true * y_pred, 0, 1))) + possible_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true, 0, 1))) + recall = true_positives / (possible_positives + kbackend.epsilon()) + return recall + + +@register_metric +def f1score(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the F1 score, the harmonic average of precision and recall. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of F1 score. + """ + prec = precision(y_true, y_pred) + rec = recall(y_true, y_pred) + return 2 * prec * rec / (prec + rec + kbackend.epsilon()) diff --git a/sourced/ml/core/algorithms/id_splitter/pipeline.py b/sourced/ml/core/algorithms/id_splitter/pipeline.py new file mode 100644 index 0000000..debe27a --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/pipeline.py @@ -0,0 +1,222 @@ +from datetime import datetime +import logging +import os +import random +from typing import Callable, Iterable, List, Tuple +import warnings + +import keras +from keras import backend as kbackend +from keras.callbacks import CSVLogger, LearningRateScheduler, ModelCheckpoint, TensorBoard +import numpy +try: + import tensorflow as tf +except ImportError: + warnings.warn("Tensorflow is not installed, dependent functionality is unavailable.") + + +# additional variable to avoid any division by zero when computing the precision and recall metrics +EPSILON = 10 ** -8 +# threshold that is used to binarize predictions of the model +DEFAULT_THRESHOLD = 0.5 + + +def set_random_seed(seed: int) -> None: + """ + Fixes a random seed for reproducibility. + + :param seed: seed value. + """ + numpy.random.seed(seed) + random.seed(seed) + tf.set_random_seed(seed) + + +def binarize(matrix: numpy.array, threshold: float, inplace: bool = True) -> numpy.array: + """ + Helper function to binarize a matrix. + + :param matrix: matrix as a numpy.array. + :param threshold: if value >= threshold then the value will be 1, else 0. + :param inplace: whether to modify the matrix inplace or not. + :return: the binarized matrix. + """ + mask = matrix >= threshold + if inplace: + matrix_ = matrix + else: + matrix_ = matrix.copy() + matrix_[mask] = 1 + matrix_[numpy.logical_not(mask)] = 0 + return matrix_ + + +def str2ints(params: str) -> List[int]: + """ + Convert a string with integer parameters to a list of integers. + + :param params: string that contains integer parameters separated by commas. + :return: list of integers. + """ + return list(map(int, params.split(","))) + + +def precision_np(y_true: numpy.array, y_pred: numpy.array, epsilon: float = EPSILON) -> float: + """ + Computes the precision metric, a metric for multi-label classification of + how many selected items are relevant. + + :param y_true: ground truth labels - expect binary values. + :param y_pred: predicted labels - expect binary values. + :param epsilon: added to the denominator to avoid any division by zero. + :return: precision metric. + """ + true_positives = numpy.sum(y_true * y_pred) + predicted_positives = numpy.sum(y_pred) + return true_positives / (predicted_positives + epsilon) + + +def recall_np(y_true: numpy.array, y_pred: numpy.array, epsilon: float = EPSILON) -> float: + """ + Computes the recall metric, a metric for multi-label classification of + how many relevant items are selected. 
+ + :param y_true: matrix with ground truth labels - expect binary values. + :param y_pred: matrix with predicted labels - expect binary values. + :param epsilon: added to the denominator to avoid any division by zero. + :return: recall metric. + """ + true_positives = numpy.sum(y_true * y_pred) + possible_positives = numpy.sum(y_true) + return true_positives / (possible_positives + epsilon) + + +def report(model: keras.engine.training.Model, X: numpy.array, y: numpy.array, batch_size: int, + threshold: float = DEFAULT_THRESHOLD, epsilon: float = EPSILON) -> None: + """ + Prints a metric report of the `model` on the data `X` & `y`. + The metrics printed are precision, recall, F1 score. + + :param model: model considered. + :param X: features. + :param y: labels (expected binary labels). + :param batch_size: batch size that will be used for prediction. + :param threshold: threshold to binarize the predictions. + :param epsilon: added to the denominator to avoid any division by zero. + """ + log = logging.getLogger("report") + + # predict & skip the last dimension & binarize + predictions = model.predict(X, batch_size=batch_size, verbose=1)[:, :, 0] + predictions = binarize(predictions, threshold) + + # report + pr = precision_np(y[:, :, 0], predictions, epsilon=epsilon) + rec = recall_np(y[:, :, 0], predictions, epsilon=epsilon) + f1 = 2 * pr * rec / (pr + rec + epsilon) + log.info("precision: %.3f, recall: %.3f, f1: %.3f" % (pr, rec, f1)) + + +def config_keras() -> None: + """ + Initializes keras backend session. + """ + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + kbackend.tensorflow_backend.set_session(tf.Session(config=config)) + + +def build_train_generator(X: numpy.array, y: numpy.array, + batch_size: int = 500) -> Iterable[Tuple[numpy.array]]: + """ + Builds the generator that yields features and their labels. + + :param X: features. + :param y: binary labels. + :param batch_size: higher values better utilize GPUs. + :return: generator of features and their labels. + """ + assert X.shape[0] == y.shape[0], "Number of samples mismatch in X and y." + + def xy_generator(): + while True: + n_batches = X.shape[0] // batch_size + if n_batches * batch_size < X.shape[0]: + n_batches += 1 # to yield last samples + for i in range(n_batches): + start = i * batch_size + end = min((i + 1) * batch_size, X.shape[0]) + yield X[start:end], y[start:end] + return xy_generator() + + +def build_schedule(lr: float, final_lr: float, n_epochs: int) -> Callable: + """ + Builds the schedule of which the learning rate decreases. + The schedule makes the learning rate decrease linearly. + + :param lr: initial learning rate. + :param final_lr: final learning rate. + :param n_epochs: number of training epochs. + :return: the schedule of the learning rate. + """ + delta = (lr - final_lr) / n_epochs + + def schedule(epoch: int) -> float: + assert 0 <= epoch < n_epochs + return lr - delta * epoch + return schedule + + +def make_lr_scheduler(lr: float, final_lr: float, n_epochs: int, + verbose: int = 1) -> keras.callbacks.LearningRateScheduler: + """ + Prepares the scheduler to decrease the learning rate while training. + + :param lr: initial learning rate. + :param final_lr: final learning rate. + :param n_epochs: number of training epochs. + :param verbose: level of verbosity. + :return: LearningRateScheduler with linear schedule of the learning rate. 
+ """ + schedule = build_schedule(lr, final_lr, n_epochs) + return LearningRateScheduler(schedule=schedule, verbose=verbose) + + +def prepare_callbacks(output_dir: str) -> Tuple[Callable]: + """ + Prepares logging, tensorboard, model checkpoint callbacks and stores the outputs in output_dir. + + :param output_dir: path to the results. + :return: list of callbacks. + """ + time = datetime.now().strftime("%y%m%d-%H%M") + log_dir = os.path.join(output_dir, "tensorboard" + time) + logging.info("Tensorboard directory: %s" % log_dir) + tensorboard = TensorBoard(log_dir=log_dir, batch_size=1000, write_images=True, + write_graph=True) + csv_path = os.path.join(output_dir, "csv_logger_" + time + ".txt") + logging.info("CSV logs: %s" % csv_path) + csv_logger = CSVLogger(csv_path) + + filepath = os.path.join(output_dir, "best_" + time + ".model") + model_saver = ModelCheckpoint(filepath, monitor="val_recall", verbose=1, save_best_only=True, + mode="max") + return tensorboard, csv_logger, model_saver + + +def create_generator_params(batch_size: int, samples_per_epoch: int, n_samples: int, + epochs: int) -> Tuple[int]: + """ + Helper function to split a huge dataset into smaller ones to enable more frequent reports. + + :param batch_size: batch size. + :param samples_per_epoch: number of samples per mini-epoch or before each report. + :param n_samples: total number of samples. + :param epochs: number of epochs over the full dataset. + :return: number of steps per epoch (should be used with the generator) and number of sub-epochs + where during sub-epoch only samples_per_epoch will be generated. + """ + steps_per_epoch = samples_per_epoch // batch_size + n_epochs = numpy.ceil(epochs * n_samples / samples_per_epoch) + return steps_per_epoch, n_epochs diff --git a/sourced/ml/core/algorithms/swivel.py b/sourced/ml/core/algorithms/swivel.py new file mode 100644 index 0000000..bfd126f --- /dev/null +++ b/sourced/ml/core/algorithms/swivel.py @@ -0,0 +1,491 @@ +#!/usr/bin/env python3 +# +# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2017 Sourced Technologies S. L. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Submatrix-wise Vector Embedding Learner. + +Implementation of SwiVel algorithm described at: +http://arxiv.org/abs/1602.02215 + +This program expects an input directory that contains the following files. + + row_vocab.txt, col_vocab.txt + + The row an column vocabulary files. Each file should contain one token per + line; these will be used to generate a tab-separate file containing the + trained embeddings. + + row_sums.txt, col_sum.txt + + The matrix row and column marginal sums. Each file should contain one + decimal floating point number per line which corresponds to the marginal + count of the matrix for that row or column. + + shards.recs + + A file containing the sub-matrix shards, stored as TFRecords. 
Each shard is + expected to be a serialzed tf.Example protocol buffer with the following + properties: + + global_row: the global row indices contained in the shard + global_col: the global column indices contained in the shard + sparse_local_row, sparse_local_col, sparse_value: three parallel arrays + that are a sparse representation of the submatrix counts. + +It will generate embeddings, training from the input directory for +the specified number of epochs. When complete, it will output the trained +vectors to a tab-separated file that contains one line per embedding. Row and +column embeddings are stored in separate files. + +""" + +import glob +import math +import os +import threading +import time + +import numpy +import tensorflow as tf +from tensorflow.python.client import device_lib + +flags = tf.app.flags + +flags.DEFINE_string("input_base_path", None, + "Directory containing input shards, vocabularies, " + "and marginals.") +flags.DEFINE_string("output_base_path", None, + "Path where to write the trained embeddings.") +flags.DEFINE_integer("embedding_size", 300, "Size of the embeddings") +flags.DEFINE_boolean("trainable_bias", False, "Biases are trainable") +flags.DEFINE_integer("submatrix_rows", 4096, + "Rows in each training submatrix. This must match " + "the training data.") +flags.DEFINE_integer("submatrix_cols", 4096, + "Rows in each training submatrix. This must match " + "the training data.") +flags.DEFINE_float("loss_multiplier", 1.0 / 4096, + "constant multiplier on loss.") +flags.DEFINE_float("confidence_exponent", 0.5, + "Exponent for l2 confidence function") +flags.DEFINE_float("confidence_scale", 0.25, + "Scale for l2 confidence function") +flags.DEFINE_float("confidence_base", 0.1, "Base for l2 confidence function") +flags.DEFINE_float("learning_rate", 1.0, "Initial learning rate") +flags.DEFINE_string("optimizer", "Adagrad", + "SGD optimizer (tf.train.*Optimizer)") +flags.DEFINE_integer("num_concurrent_steps", 2, + "Number of threads to train with") +flags.DEFINE_integer("num_readers", 4, + "Number of threads to read the input data and feed it") +flags.DEFINE_float("num_epochs", 40, "Number epochs to train for") +flags.DEFINE_float("per_process_gpu_memory_fraction", 0, + "Fraction of GPU memory to use, 0 means allow_growth") +flags.DEFINE_integer("num_gpus", 0, + "Number of GPUs to use, 0 means all available") +flags.DEFINE_string("logs", "", + "Path for TensorBoard logs (empty value disables them)") + +FLAGS = flags.FLAGS + + +def log(message, *args, **kwargs): + tf.logging.info(message, *args, **kwargs) + + +def get_available_gpus(): + return [d.name for d in device_lib.list_local_devices() + if d.device_type == "GPU"] + + +def embeddings_with_init(vocab_size, embedding_dim, name): + """Creates and initializes the embedding tensors.""" + return tf.get_variable(name=name, + shape=[vocab_size, embedding_dim], + initializer=tf.random_normal_initializer( + stddev=math.sqrt(1.0 / embedding_dim))) + + +def count_matrix_input(filenames, submatrix_rows, submatrix_cols): + """Reads submatrix shards from disk.""" + filename_queue = tf.train.string_input_producer(filenames) + reader = tf.WholeFileReader() + _, serialized_example = reader.read(filename_queue) + features = tf.parse_single_example( + serialized_example, + features={ + "global_row": tf.FixedLenFeature([submatrix_rows], dtype=tf.int64), + "global_col": tf.FixedLenFeature([submatrix_cols], dtype=tf.int64), + "sparse_local_row": tf.VarLenFeature(dtype=tf.int64), + "sparse_local_col": 
tf.VarLenFeature(dtype=tf.int64), + "sparse_value": tf.VarLenFeature(dtype=tf.float32) + }) + + global_row = features["global_row"] + global_col = features["global_col"] + + sparse_local_row = features["sparse_local_row"].values + sparse_local_col = features["sparse_local_col"].values + sparse_count = features["sparse_value"].values + + sparse_indices = tf.concat(axis=1, values=[tf.expand_dims(sparse_local_row, 1), + tf.expand_dims(sparse_local_col, 1)]) + count = tf.sparse_to_dense(sparse_indices, [submatrix_rows, submatrix_cols], + sparse_count, validate_indices=False) + + queued_global_row, queued_global_col, queued_count = tf.train.batch( + [global_row, global_col, count], + batch_size=1, + num_threads=FLAGS.num_readers, + capacity=32) + + queued_global_row = tf.reshape(queued_global_row, [submatrix_rows]) + queued_global_col = tf.reshape(queued_global_col, [submatrix_cols]) + queued_count = tf.reshape(queued_count, [submatrix_rows, submatrix_cols]) + + return queued_global_row, queued_global_col, queued_count + + +def read_marginals_file(filename): + """Reads text file with one number per line to an array.""" + with open(filename) as lines: + return [float(line) for line in lines] + + +def write_embedding_tensor_to_disk(vocab_path, output_path, sess, embedding): + """Writes tensor to output_path as tsv""" + # Fetch the embedding values from the model + embeddings = sess.run(embedding) + + with open(output_path, "w") as out_f: + with open(vocab_path) as vocab_f: + for index, word in enumerate(vocab_f): + word = word.strip() + embedding = embeddings[index] + out_f.write(word + "\t" + "\t".join( + [str(x) for x in embedding]) + "\n") + + +def write_embeddings_to_disk(config, model, sess): + """Writes row and column embeddings disk""" + # Row Embedding + row_vocab_path = config.input_base_path + "/row_vocab.txt" + row_embedding_output_path = config.output_base_path + "/row_embedding.tsv" + log("Writing row embeddings to: %s", row_embedding_output_path) + write_embedding_tensor_to_disk(row_vocab_path, row_embedding_output_path, + sess, model.row_embedding) + + # Column Embedding + col_vocab_path = config.input_base_path + "/col_vocab.txt" + col_embedding_output_path = config.output_base_path + "/col_embedding.tsv" + log("Writing column embeddings to: %s", col_embedding_output_path) + write_embedding_tensor_to_disk(col_vocab_path, col_embedding_output_path, + sess, model.col_embedding) + + +class SwivelModel: + """Small class to gather needed pieces from a Graph being built.""" + + def __init__(self, config): + """Construct graph for dmc.""" + self._config = config + + # Create paths to input data files + log("Reading model from: %s", config.input_base_path) + count_matrix_files = glob.glob(os.path.join(config.input_base_path, "shard-*.pb")) + row_sums_path = os.path.join(config.input_base_path, "row_sums.txt") + col_sums_path = os.path.join(config.input_base_path, "col_sums.txt") + + # Read marginals + row_sums = read_marginals_file(row_sums_path) + col_sums = read_marginals_file(col_sums_path) + + self.n_rows = len(row_sums) + self.n_cols = len(col_sums) + log("Matrix dim: (%d,%d) SubMatrix dim: (%d,%d)", + self.n_rows, self.n_cols, config.submatrix_rows, + config.submatrix_cols) + if self.n_cols < config.submatrix_cols: + raise ValueError( + "submatrix_cols={0} can not be bigger than columns number={1} " + "(specify submatrix_cols={1})".format(config.submatrix_cols, self.n_cols)) + if self.n_rows < config.submatrix_rows: + raise ValueError( + "submatrix_rows={0} can not be bigger than 
rows number={1} " + "(specify submatrix_rows={1})".format(config.submatrix_rows, self.n_cols)) + self.n_submatrices = (self.n_rows * self.n_cols / + (config.submatrix_rows * config.submatrix_cols)) + log("n_submatrices: %d", self.n_submatrices) + + with tf.device("/cpu:0"): + # ===== CREATE VARIABLES ====== + # Get input + global_row, global_col, count = count_matrix_input( + count_matrix_files, config.submatrix_rows, + config.submatrix_cols) + + # Embeddings + self.row_embedding = embeddings_with_init( + embedding_dim=config.embedding_size, + vocab_size=self.n_rows, + name="row_embedding") + self.col_embedding = embeddings_with_init( + embedding_dim=config.embedding_size, + vocab_size=self.n_cols, + name="col_embedding") + tf.summary.histogram("row_emb", self.row_embedding) + tf.summary.histogram("col_emb", self.col_embedding) + + matrix_log_sum = math.log(numpy.sum(row_sums) + 1) + row_bias_init = [math.log(x + 1) for x in row_sums] + col_bias_init = [math.log(x + 1) for x in col_sums] + self.row_bias = tf.Variable( + row_bias_init, trainable=config.trainable_bias) + self.col_bias = tf.Variable( + col_bias_init, trainable=config.trainable_bias) + tf.summary.histogram("row_bias", self.row_bias) + tf.summary.histogram("col_bias", self.col_bias) + + # Add optimizer + l2_losses = [] + sigmoid_losses = [] + self.global_step = tf.Variable(0, name="global_step") + learning_rate = tf.Variable(config.learning_rate, + name="learning_rate") + opt = getattr(tf.train, FLAGS.optimizer + "Optimizer")( + learning_rate) + tf.summary.scalar("learning_rate", learning_rate) + + all_grads = [] + + devices = ["/gpu:%d" % i for i in range(FLAGS.num_gpus)] \ + if FLAGS.num_gpus > 0 else get_available_gpus() + self.devices_number = len(devices) + if not self.devices_number: + devices = ["/cpu:0"] + self.devices_number = 1 + for dev in devices: + with tf.device(dev): + with tf.name_scope(dev[1:].replace(":", "_")): + # ===== CREATE GRAPH ===== + # Fetch embeddings. + selected_row_embedding = tf.nn.embedding_lookup( + self.row_embedding, global_row) + selected_col_embedding = tf.nn.embedding_lookup( + self.col_embedding, global_col) + + # Fetch biases. + selected_row_bias = tf.nn.embedding_lookup( + [self.row_bias], global_row) + selected_col_bias = tf.nn.embedding_lookup( + [self.col_bias], global_col) + + # Multiply the row and column embeddings to generate + # predictions. + predictions = tf.matmul( + selected_row_embedding, selected_col_embedding, + transpose_b=True) + + # These binary masks separate zero from non-zero values. + count_is_nonzero = tf.to_float(tf.cast(count, tf.bool)) + count_is_zero = 1 - count_is_nonzero + + objectives = count_is_nonzero * tf.log(count + 1e-30) + objectives -= tf.reshape( + selected_row_bias, [config.submatrix_rows, 1]) + objectives -= selected_col_bias + objectives += matrix_log_sum + + err = predictions - objectives + + # The confidence function scales the L2 loss based on + # the raw co-occurrence count. 
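+                    # In other words: l2_confidence = confidence_base + confidence_scale * count ** confidence_exponent.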
+ l2_confidence = ( + config.confidence_base + + config.confidence_scale * tf.pow( + count, config.confidence_exponent)) + + l2_loss = config.loss_multiplier * tf.reduce_sum( + 0.5 * l2_confidence * err * err * count_is_nonzero) + l2_losses.append(tf.expand_dims(l2_loss, 0)) + + sigmoid_loss = config.loss_multiplier * tf.reduce_sum( + tf.nn.softplus(err) * count_is_zero) + sigmoid_losses.append(tf.expand_dims(sigmoid_loss, 0)) + + loss = l2_loss + sigmoid_loss + grads = opt.compute_gradients(loss) + all_grads.append(grads) + + with tf.device("/cpu:0"): + # ===== MERGE LOSSES ===== + l2_loss = tf.reduce_mean(tf.concat(axis=0, values=l2_losses), 0, + name="l2_loss") + sigmoid_loss = tf.reduce_mean( + tf.concat(axis=0, values=sigmoid_losses), 0, + name="sigmoid_loss") + overall_loss = l2_loss + sigmoid_loss + average = tf.train.ExponentialMovingAverage(0.999) + loss_average_op = average.apply( + (overall_loss, l2_loss, sigmoid_loss)) + self.loss = average.average(overall_loss) + tf.summary.scalar("overall_loss", self.loss) + tf.summary.scalar("l2_loss", average.average(l2_loss)) + tf.summary.scalar("sigmoid_loss", average.average(sigmoid_loss)) + + # Apply the gradients to adjust the shared variables. + apply_gradient_ops = [] + for grads in all_grads: + apply_gradient_ops.append(opt.apply_gradients( + grads, global_step=self.global_step)) + + self.train_op = tf.group(loss_average_op, *apply_gradient_ops) + self.saver = tf.train.Saver(sharded=True) + + def initialize_summary(self, sess): + log("creating TensorBoard stuff...") + self.summary = tf.summary.merge_all() + self.writer = tf.summary.FileWriter(FLAGS.logs, sess.graph) + projector_config = \ + tf.contrib.tensorboard.plugins.projector.ProjectorConfig() + embedding_config = projector_config.embeddings.add() + length = min(10000, self.n_rows, self.n_cols) + self.embedding10k = tf.Variable( + tf.zeros((length, self._config.embedding_size)), + name="top10k_embedding") + embedding_config.tensor_name = self.embedding10k.name + embedding_config.metadata_path = os.path.join( + self._config.input_base_path, "row_vocab.txt") + tf.contrib.tensorboard.plugins.projector.visualize_embeddings( + self.writer, projector_config) + self.saver = tf.train.Saver((self.embedding10k,), max_to_keep=1) + + def write_summary(self, sess): + log("writing the summary...") + length = min(10000, self.n_rows, self.n_cols) + assignment = self.embedding10k.assign( + (self.row_embedding[:length] + self.col_embedding[:length]) / 2) + summary, _, global_step = sess.run( + (self.summary, assignment, self.global_step)) + self.writer.add_summary(summary, global_step) + self.saver.save( + sess, os.path.join(FLAGS.logs, "embeddings10k.checkpoint"), + global_step) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + start_time = time.time() + + omitted = {"handler", "command"} + + log("Swivel parameters:\n" + "\n".join( + "\t{:20} {}".format(key, value) for key, value in + sorted(FLAGS.__dict__.items()) if key not in omitted)) + # Create the output path. If this fails, it really ought to fail now. :) + if not os.path.isdir(FLAGS.output_base_path): + os.makedirs(FLAGS.output_base_path) + + # Create and run model + with tf.Graph().as_default(): + log("creating the model...") + model = SwivelModel(FLAGS) + + # Create a session for running Ops on the Graph. 
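+        # A per_process_gpu_memory_fraction of 0 means no fixed memory cap: allow_growth lets the session allocate GPU memory on demand.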
+ gpu_opts = {} + if FLAGS.per_process_gpu_memory_fraction > 0: + gpu_opts["per_process_gpu_memory_fraction"] = \ + FLAGS.per_process_gpu_memory_fraction + else: + gpu_opts["allow_growth"] = True + gpu_options = tf.GPUOptions(**gpu_opts) + sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + if FLAGS.logs: + model.initialize_summary(sess) + + # Run the Op to initialize the variables. + log("initializing the variables...") + sess.run(tf.global_variables_initializer()) + + # Start feeding input + log("starting the input threads...") + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + # Calculate how many steps each thread should run + n_total_steps = int(FLAGS.num_epochs * model.n_rows * model.n_cols) / ( + FLAGS.submatrix_rows * FLAGS.submatrix_cols) + n_steps_per_thread = n_total_steps / ( + FLAGS.num_concurrent_steps * model.devices_number) + n_submatrices_to_train = model.n_submatrices * FLAGS.num_epochs + t0 = [time.time()] + n_steps_between_status_updates = 100 + n_steps_between_summary_updates = 10000 + status_i = [0, 0] + status_lock = threading.Lock() + msg = ("%%%dd/%%d submatrices trained (%%.1f%%%%), " + "%%5.1f submatrices/sec | loss %%f") % \ + len(str(n_submatrices_to_train)) + + def TrainingFn(): + for _ in range(int(n_steps_per_thread)): + _, global_step, loss = sess.run(( + model.train_op, model.global_step, model.loss)) + + show_status = False + update_summary = False + with status_lock: + new_i = global_step // n_steps_between_status_updates + if new_i > status_i[0]: + status_i[0] = new_i + show_status = True + new_i = global_step // n_steps_between_summary_updates + if new_i > status_i[1]: + status_i[1] = new_i + update_summary = True + if show_status: + elapsed = float(time.time() - t0[0]) + log(msg, global_step, n_submatrices_to_train, + 100.0 * global_step / n_submatrices_to_train, + n_steps_between_status_updates / elapsed, loss) + t0[0] = time.time() + if update_summary and FLAGS.logs: + model.write_summary(sess) + + # Start training threads + train_threads = [] + for _ in range(FLAGS.num_concurrent_steps): + t = threading.Thread(target=TrainingFn) + train_threads.append(t) + t.start() + + # Wait for threads to finish. + for t in train_threads: + t.join() + + coord.request_stop() + coord.join(threads) + + # Write out vectors + write_embeddings_to_disk(FLAGS, model, sess) + + # Shutdown + sess.close() + log("Elapsed: %s", time.time() - start_time) + + +if __name__ == "__main__": + tf.app.run() diff --git a/sourced/ml/core/algorithms/tf_idf.py b/sourced/ml/core/algorithms/tf_idf.py new file mode 100644 index 0000000..7cbc21a --- /dev/null +++ b/sourced/ml/core/algorithms/tf_idf.py @@ -0,0 +1,5 @@ +import numpy + + +def log_tf_log_idf(tf, df, ndocs): + return numpy.log(1 + tf) * numpy.log(ndocs / df) diff --git a/sourced/ml/core/algorithms/token_parser.py b/sourced/ml/core/algorithms/token_parser.py new file mode 100644 index 0000000..fb26f1f --- /dev/null +++ b/sourced/ml/core/algorithms/token_parser.py @@ -0,0 +1,135 @@ +import re + +import Stemmer + + +class TokenParser: + """ + Common utilities for splitting and stemming tokens. + """ + NAME_BREAKUP_RE = re.compile(r"[^a-zA-Z]+") #: Regexp to split source code identifiers. + STEM_THRESHOLD = 6 #: We do not stem split parts shorter than or equal to this size. + MAX_TOKEN_LENGTH = 256 #: We cut identifiers longer than this value. + MIN_SPLIT_LENGTH = 3 #: We do not split source code identifiers shorter than this value. 
+ DEFAULT_SINGLE_SHOT = False #: True if we do not want to join small identifiers to next one. + # Example: 'sourced.ml.algorithms' -> ["sourc", "sourcedml", "algorithm", "mlalgorithm"]. + # if True we have only ["sourc", "algorithm"]. + # if you do not want to filter small tokens set min_split_length=1. + + def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH, + min_split_length=MIN_SPLIT_LENGTH, single_shot=DEFAULT_SINGLE_SHOT): + self._stemmer = Stemmer.Stemmer("english") + self._stemmer.maxCacheSize = 0 + self._stem_threshold = stem_threshold + self._max_token_length = max_token_length + self._min_split_length = min_split_length + self._single_shot = single_shot + + @property + def stem_threshold(self): + return self._stem_threshold + + @stem_threshold.setter + def stem_threshold(self, value): + if not isinstance(value, int): + raise TypeError("stem_threshold must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("stem_threshold must be greater than 0 - got %d" % value) + self._stem_threshold = value + + @property + def max_token_length(self): + return self._max_token_length + + @max_token_length.setter + def max_token_length(self, value): + if not isinstance(value, int): + raise TypeError("max_token_length must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("max_token_length must be greater than 0 - got %d" % value) + self._max_token_length = value + + @property + def min_split_length(self): + return self._min_split_length + + @min_split_length.setter + def min_split_length(self, value): + if not isinstance(value, int): + raise TypeError("min_split_length must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("min_split_length must be greater than 0 - got %d" % value) + self._min_split_length = value + + def __call__(self, token): + return self.process_token(token) + + def process_token(self, token): + for word in self.split(token): + yield self.stem(word) + + def stem(self, word): + if len(word) <= self.stem_threshold: + return word + return self._stemmer.stemWord(word) + + def split(self, token): + token = token.strip()[:self.max_token_length] + + def ret(name): + r = name.lower() + if len(name) >= self.min_split_length: + ret.last_subtoken = r + yield r + if ret.prev_p and not self._single_shot: + yield ret.prev_p + r + ret.prev_p = "" + elif not self._single_shot: + ret.prev_p = r + yield ret.last_subtoken + r + ret.last_subtoken = "" + ret.prev_p = "" + ret.last_subtoken = "" + + for part in self.NAME_BREAKUP_RE.split(token): + if not part: + continue + prev = part[0] + pos = 0 + for i in range(1, len(part)): + this = part[i] + if prev.islower() and this.isupper(): + yield from ret(part[pos:i]) + pos = i + elif prev.isupper() and this.islower(): + if 0 < i - 1 - pos <= self.min_split_length: + yield from ret(part[pos:i]) + pos = i + elif i - 1 > pos: + yield from ret(part[pos:i]) + pos = i + prev = this + last = part[pos:] + if last: + yield from ret(last) + + def __getstate__(self): + state = self.__dict__.copy() + del state["_stemmer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + self._stemmer = Stemmer.Stemmer("english") + + +class NoopTokenParser: + """ + One can use this class if he or she does not want to do any parsing. 
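+    It yields every token unchanged, without splitting or stemming.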
+ """ + + def process_token(self, token): + yield token + + def __call__(self, token): + return self.process_token(token) diff --git a/sourced/ml/core/algorithms/uast_id_distance.py b/sourced/ml/core/algorithms/uast_id_distance.py new file mode 100644 index 0000000..8be6fb9 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_id_distance.py @@ -0,0 +1,122 @@ +from itertools import combinations +from typing import Iterable, Tuple, Union + +import bblfsh + +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag +from sourced.ml.core.utils import bblfsh_roles + + +class Uast2IdDistance(UastIds2Bag): + """ + Converts a UAST to a list of identifiers pair and UAST distance between. + Distance metric must be defined in the inheritors. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + DEFAULT_MAX_DISTANCE = 10 # to avoid collecting all distances we skip too big ones + + def __init__(self, token2index=None, token_parser=None, max_distance=DEFAULT_MAX_DISTANCE): + """ + :param token2index: The mapping from tokens to token key. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + :param max_distance: specify to skip too distant identifiers + """ + super().__init__(token2index=token2index, token_parser=token_parser) + self.max_distance = max_distance + + def __call__(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str, int]]: + """ + Converts a UAST to a list of identifiers pair and UAST distance between. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: a list of (from identifier, to identifier) and distance pairs. + """ + for point1, point2 in combinations(self._process_uast(uast), 2): + if point1[0] == point2[0]: + continue # We do not want to calculate distance between the same identifiers + distance = self.distance(point1, point2) + if distance < self.max_distance: + yield ((point1[0], point2[0]) if point1[0] > point2[0] else + (point2[0], point1[0])), distance + + def distance(self, point1, point2) -> Union[int, float]: + """ + Calculate distance between two points. A point can be anything. self._process_uast returns + list of points in the specific class. + + :return: Distance between two points. + """ + raise NotImplementedError + + def _process_uast(self, node: bblfsh.Node) -> Iterable: + """ + Converts uast to points list. A point can be anything you need to calculate distance. + """ + raise NotImplementedError + + def _process_point(self, node, info): + if bblfsh_roles.IDENTIFIER in node.roles and node.token: + for sub in self._token_parser.process_token(node.token): + try: + yield (self._token2index[sub], info) + except KeyError: + continue + + +class Uast2IdTreeDistance(Uast2IdDistance): + """ + Converts a UAST to a list of identifiers pair and UAST tree distance between. + + __call__ is overridden here and return list instead of bag-of-words (dist). 
+ """ + def _process_uast(self, uast: bblfsh.Node) -> Iterable: + stack = [(uast, [])] + while stack: + node, ancestors = stack.pop() + yield from self._process_point(node, ancestors) + ancestors = list(ancestors) + ancestors.append(node) + stack.extend([(child, ancestors) for child in node.children]) + + def distance(self, point1, point2) -> int: + i = 0 + ancestors1 = point1[1] + ancestors2 = point2[1] + for i, (ancestor1, ancestor2) in enumerate(zip(ancestors1, ancestors2)): # noqa: B007 + if ancestor1 != ancestor2: + break + distance = self.calc_tree_distance(i, len(ancestors1), len(ancestors2)) + return distance + + @staticmethod + def calc_tree_distance(last_common_level, level1, level2): + return level1 + level2 - 2 * last_common_level + + +class Uast2IdLineDistance(Uast2IdDistance): + """ + Converts a UAST to a list of identifiers pair and code line distance between where applicable. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + def _process_uast(self, uast): + stack = [(uast, [0, 0])] + while stack: + node, last_position = stack.pop() + if node.start_position.line != 0: + # A lot of Nodes do not have position + # It is good heuristic to take the last Node in tree with a position. + last_position[0] = node.start_position.line + last_position[1] = 0 + if node.start_position.col != 0: + last_position[1] = node.start_position.col + yield from self._process_point(node, last_position) + stack.extend([(child, list(last_position)) for child in node.children]) + + def distance(self, point1, point2): + return abs(point1[1][0] - point2[1][0]) # subtract line numbers diff --git a/sourced/ml/core/algorithms/uast_ids_to_bag.py b/sourced/ml/core/algorithms/uast_ids_to_bag.py new file mode 100644 index 0000000..2e02f7f --- /dev/null +++ b/sourced/ml/core/algorithms/uast_ids_to_bag.py @@ -0,0 +1,110 @@ +from collections import defaultdict, deque + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser, TokenParser +from sourced.ml.core.algorithms.uast_to_bag import Uast2BagBase +from sourced.ml.core.utils import bblfsh_roles + + +def uast2sequence(root): + sequence = [] + nodes = defaultdict(deque) + stack = [root] + nodes[id(root)].extend(root.children) + while stack: + if nodes[id(stack[-1])]: + child = nodes[id(stack[-1])].popleft() + nodes[id(child)].extend(child.children) + stack.append(child) + else: + sequence.append(stack.pop()) + return sequence + + +class FakeVocabulary: + # FIXME(zurk): change to simple function. Vadim Markovtsev comments: + # > would rather made this a simple function and change roles2index + # type from [] to callable. Saves time to understand. + def __getitem__(self, item): + return item + + +class UastTokens2Bag(Uast2BagBase): + """ + Converts a UAST to a weighed bag of tokens via xpath. + """ + + XPATH = None # Should be overridden in child class + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'NoopTokenParser' is used if it is not specified. 
+ """ + self._token2index = FakeVocabulary() if token2index is None else token2index + self._token_parser = NoopTokenParser() if token_parser is None else token_parser + + @property + def token_parser(self): + return self._token_parser + + @property + def token2index(self): + return self._token2index + + def __call__(self, uast): + """ + Converts a UAST to a weighed bag-of-words. The weights are words frequencies. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: + """ + nodes = bblfsh.filter(uast, self.XPATH) + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag + + +class UastIds2Bag(UastTokens2Bag): + """ + Converts a UAST to a bag-of-identifiers. + """ + + XPATH = "//*[@roleIdentifier]" + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + """ + token_parser = TokenParser() if token_parser is None else token_parser + super().__init__(token2index, token_parser) + + def __call__(self, uast): + """ + HOTFIX for https://github.com/bblfsh/client-python/issues/92 + Converts a UAST to a weighed bag-of-identifiers. The weights are identifiers frequencies. + The tokens are preprocessed by _token_parser. + Overwrite __call__ to avoid issues with `bblfsh.filter`. + + :param uast: The UAST root node. + :return: bag + """ + nodes = [node for node in uast2sequence(uast) if bblfsh_roles.IDENTIFIER in node.roles] + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag diff --git a/sourced/ml/core/algorithms/uast_inttypes_to_graphlets.py b/sourced/ml/core/algorithms/uast_inttypes_to_graphlets.py new file mode 100644 index 0000000..d089756 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_inttypes_to_graphlets.py @@ -0,0 +1,59 @@ +from collections import defaultdict + +from sourced.ml.core.algorithms.uast_ids_to_bag import Uast2BagBase +from sourced.ml.core.algorithms.uast_struct_to_bag import Node + + +class Uast2GraphletBag(Uast2BagBase): + """ + Converts a UAST to a bag of graphlets. + The graphlet of a UAST node is composed from the node itself, its parent and its children. + Each node is represented by the internal role string. + """ + @staticmethod + def _extract_node(node, parent): + return Node(parent=parent, internal_type=node.internal_type) + + def uast2graphlets(self, uast): + """ + :param uast: The UAST root node. + :generate: The nodes which compose the UAST. + :class: 'Node' is used to access the nodes of the graphlets. + """ + root = self._extract_node(uast, None) + stack = [(root, uast)] + while stack: + parent, parent_uast = stack.pop() + children_nodes = [self._extract_node(child, parent) for child in parent_uast.children] + parent.children = children_nodes + stack.extend(zip(children_nodes, parent_uast.children)) + yield parent + + def node2key(self, node): + """ + Builds the string joining internal types of all the nodes + in the node's graphlet in the following order: + parent_node_child1_child2_child3. The children are sorted by alphabetic order. + str format is required for BagsExtractor. 
+ + :param node: a node of UAST + :return: The string key of node + """ + try: + parent_type = node.parent.internal_type + except AttributeError: + parent_type = None + key = [parent_type, node.internal_type] + key.extend(sorted(ch.internal_type for ch in node.children)) + return "_".join(map(str, key)) + + def __call__(self, uast): + """ + Converts a UAST to a weighed bag of graphlets. The weights are graphlets frequencies. + :param uast: The UAST root node. + :return: bag of graphlets. + """ + bag = defaultdict(int) + for node in self.uast2graphlets(uast): + bag[self.node2key(node)] += 1 + return bag diff --git a/sourced/ml/core/algorithms/uast_inttypes_to_nodes.py b/sourced/ml/core/algorithms/uast_inttypes_to_nodes.py new file mode 100644 index 0000000..5d9daee --- /dev/null +++ b/sourced/ml/core/algorithms/uast_inttypes_to_nodes.py @@ -0,0 +1,64 @@ +from typing import Iterable, Tuple, Union + +from bblfsh import Node +import numpy + +from sourced.ml.core.algorithms.uast_to_bag import Uast2BagThroughSingleScan + + +class Uast2QuantizedChildren(Uast2BagThroughSingleScan): + """ + Converts a UAST to a bag of children counts. + """ + + def __init__(self, npartitions: int = 20): + self.npartitions = npartitions + self.levels = {} + + def node2key(self, node: Node) -> Union[str, Tuple[str, int]]: + """ + :param node: a node in UAST. + :return: The string which consists of the internal type of the node and its number of + children. + """ + if not self.levels: + return node.internal_type, len(node.children) + qm = self.levels[node.internal_type] + quant_index = numpy.searchsorted(qm, len(node.children), side="right") - 1 + return "%s_%d" % (node.internal_type, quant_index) + + def quantize(self, frequencies: Iterable[Tuple[str, Iterable[Tuple[int, int]]]]): + for key, vals in frequencies: + self.levels[key] = self.quantize_unwrapped(vals) + + def quantize_unwrapped(self, children_freq: Iterable[Tuple[int, int]]) -> numpy.ndarray: + """ + Builds the quantization partition P that is a vector of length nb_partitions \ + whose entries are in strictly ascending order. + Quantization of x is defined as: + 0 if x <= P[0] + m if P[m-1] < x <= P[m] + n if P[n] <= x + + :param children_freq: distribution of the number of children. + :return: The array with quantization levels. 
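
Once `quantize` has filled `self.levels`, `node2key` buckets a node by its number of children with `numpy.searchsorted`. A worked example with hypothetical level borders for the `Block` internal type:

```python
import numpy

levels = {"Block": numpy.array([0, 2, 5, 10, 50])}  # hypothetical borders

n_children = 7
qm = levels["Block"]
quant_index = numpy.searchsorted(qm, n_children, side="right") - 1
print("%s_%d" % ("Block", quant_index))  # Block_2, the bucket starting at border 5
```
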
+ """ + levels = numpy.zeros(self.npartitions + 1, dtype=numpy.int32) + children_freq = sorted(children_freq) + max_nodes_per_bin = sum(i[1] for i in children_freq) / self.npartitions + levels[0] = children_freq[0][0] + accum = children_freq[0][1] + i = 1 + for v, f in children_freq[1:]: + accum += f + if accum > max_nodes_per_bin: + accum = f + if i < len(levels): + levels[i] = v + i += 1 + last = children_freq[-1][0] + if i < len(levels): + levels[i:] = last + else: + levels[-1] = last + return levels diff --git a/sourced/ml/core/algorithms/uast_struct_to_bag.py b/sourced/ml/core/algorithms/uast_struct_to_bag.py new file mode 100644 index 0000000..5ee4ce1 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_struct_to_bag.py @@ -0,0 +1,180 @@ +from collections import defaultdict +import random + +from sourced.ml.core.algorithms.uast_ids_to_bag import FakeVocabulary, Uast2BagBase, uast2sequence + + +class Uast2StructBagBase(Uast2BagBase): + SEP = ">" + + def __init__(self, stride, seq_len, node2index=None): + self._node2index = node2index if node2index is not None else FakeVocabulary() + self._stride = stride + if not isinstance(seq_len, (int, tuple, list)): + raise TypeError("Unexpected type of seq_len: %s" % type(seq_len)) + self._seq_lens = [seq_len] if isinstance(seq_len, int) else seq_len + + @property + def node2index(self): + return self._node2index + + +class Node2InternalType: + # FIXME(zurk): change to simple function. Vadim Markovtsev comments: + # > would rather made this a simple function and change roles2index + # type from [] to callable. Saves time to understand. + def __getitem__(self, item): + return item.internal_type + + +class UastSeq2Bag(Uast2StructBagBase): + """ + DFS traversal + preserves the order of node children. + """ + + def __init__(self, stride=1, seq_len=(3, 4), node2index=None): + _node2index = Node2InternalType() if node2index is None else node2index + super().__init__(stride, seq_len, _node2index) + + def __call__(self, uast): + bag = defaultdict(int) + node_sequence = uast2sequence(uast) + + # convert to str - requirement from wmhash.BagsExtractor + node_sequence = [self.node2index[n] for n in node_sequence] + + for seq_len in self._seq_lens: + for i in range(0, len(node_sequence) - seq_len + 1, self._stride): + key = self.SEP.join(node_sequence[i:i + seq_len]) + bag[key] += 1 + return bag + + +class Node: + def __init__(self, parent=None, internal_type=None): + self.parent = parent + self.internal_type = internal_type + self.children = [] + + @property + def neighbours(self): + neighbours = [] + if self.parent is not None: + neighbours.append(self.parent) + neighbours.extend(self.children) + return neighbours + + +class Uast2RandomWalks: + """ + Generation of random walks for UAST. + """ + + def __init__(self, p_explore_neighborhood, q_leave_neighborhood, n_walks, n_steps, + node2index=None, seed=None): + """ + Related article: https://arxiv.org/abs/1607.00653 + + :param p_explore_neighborhood: return parameter, p. Parameter p controls the likelihood of\ + immediately revisiting a node in the walk. Setting it to a\ + high value (> max(q, 1)) ensures that we are less likely to\ + sample an already visited node in the following two steps\ + (unless the next node in the walk had no other neighbor).\ + This strategy encourages moderate exploration and avoids\ + 2-hop redundancy in sampling. + :param q_leave_neighborhood: in-out parameter, q. Parameter q allows the search to\ + differentiate between “inward” and “outward” nodes. 
Such \ + walks obtain a local view of the underlying graph with \ + respect to the start node in the walk and approximate BFS \ + behavior in the sense that our samples comprise of nodes \ + within a small locality. + :param n_walks: Number of walks from each node. + :param n_steps: Number of steps in walk. + :param node2index: Specify node2index transformation. Node2InternalType() is used as \ + default. + :param seed: Random seed. + """ + self.p_explore_neighborhood = p_explore_neighborhood + self.q_leave_neighborhood = q_leave_neighborhood + self.n_walks = n_walks + self.n_steps = n_steps + self.node2index = node2index if node2index is not None else Node2InternalType() + if seed is not None: + random.seed(seed) + + def __call__(self, uast): + starting_nodes = self.prepare_starting_nodes(uast) + for _ in range(self.n_walks): + for start_node in starting_nodes: + yield self.random_walk(start_node) + + @staticmethod + def _extract_node(node, parent): + return Node(parent=parent, internal_type=node.internal_type) + + def prepare_starting_nodes(self, uast): + starting_nodes = [] + root = self._extract_node(uast, None) + stack = [(root, uast)] + while stack: + parent, parent_uast = stack.pop() + children_nodes = [self._extract_node(child, parent) for child in parent_uast.children] + parent.children = children_nodes + stack.extend(zip(children_nodes, parent_uast.children)) + starting_nodes.append(parent) + + return starting_nodes + + def random_walk(self, node): + walk = [node] + while len(walk) < self.n_steps: + walk.append(self.alias_sample(walk)) + + walk = [self.node2index[n] for n in walk] + return walk + + def alias_sample(self, walk): + """ + Compare to node2vec this sampling is a bit simpler because there is no loop in tree -> + so there are only 2 options with unnormalized probabilities 1/p & 1/q + Related article: https://arxiv.org/abs/1607.00653 + + :param walk: list of visited nodes + :return: next node to visit + """ + last_node = walk[-1] # correspond to node v in article + + if len(walk) == 1: + choice_list = last_node.children + if last_node.parent is not None: + choice_list.append(last_node.parent) + if len(choice_list) == 0: + return last_node + return random.choice(last_node.children) + + threshold = (1 / self.p_explore_neighborhood) + threshold /= (threshold + len(last_node.children) / self.q_leave_neighborhood) + + if random.random() <= threshold: + # With threshold probability we need to return back to previous node. + return walk[-2] # Node from previous step. Correspond to node t in article. 
+ + return random.choice(last_node.neighbours) + + +class UastRandomWalk2Bag(Uast2StructBagBase): + def __init__(self, p_explore_neighborhood=0.79, q_leave_neighborhood=0.82, n_walks=2, + n_steps=10, stride=1, seq_len=(2, 3), seed=42): + super().__init__(stride, seq_len) + self.uast2walks = Uast2RandomWalks(p_explore_neighborhood=p_explore_neighborhood, + q_leave_neighborhood=q_leave_neighborhood, + n_walks=n_walks, n_steps=n_steps, seed=seed) + + def __call__(self, uast): + bag = defaultdict(int) + for walk in self.uast2walks(uast): + for seq_len in self._seq_lens: + for i in range(0, len(walk) - seq_len + 1, self._stride): + # convert to str - requirement from wmhash.BagsExtractor + bag[self.SEP.join(walk[i:i + seq_len])] += 1 + return bag diff --git a/sourced/ml/core/algorithms/uast_to_bag.py b/sourced/ml/core/algorithms/uast_to_bag.py new file mode 100644 index 0000000..2d78ad5 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_to_bag.py @@ -0,0 +1,34 @@ +from collections import defaultdict +from typing import Dict + +from bblfsh import Node + + +class Uast2BagBase: + """ + Base class to convert UAST to a bag of anything. + """ + def __call__(self, uast: Node): + """ + Inheritors must implement this function. + + :param uast: The UAST root node. + """ + raise NotImplementedError + + +class Uast2BagThroughSingleScan(Uast2BagBase): + """ + Constructs the bag by doing a single tree traversal and turning every node into a string. + """ + def __call__(self, uast: Node) -> Dict[str, int]: + result = defaultdict(int) + stack = [uast] + while stack: + node = stack.pop() + stack.extend(node.children) + result[self.node2key(node)] += 1 + return result + + def node2key(self, node) -> str: + raise NotImplementedError diff --git a/sourced/ml/core/algorithms/uast_to_id_sequence.py b/sourced/ml/core/algorithms/uast_to_id_sequence.py new file mode 100644 index 0000000..1829bd7 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_to_id_sequence.py @@ -0,0 +1,30 @@ +from typing import Iterable + +import bblfsh + +from sourced.ml.core.algorithms.uast_id_distance import Uast2IdLineDistance + + +class Uast2IdSequence(Uast2IdLineDistance): + """ + Converts a UAST to a sorted sequence of identifiers. + Identifiers are sorted by position in code. + We do not change the order if positions are not present. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + def __call__(self, uast: bblfsh.Node) -> str: + """ + Converts a UAST to a sorted sequence of identifiers. + Identifiers are sorted by position in code. + We do not change the order if positions are not present. + + :param uast: The UAST root node. + :return: string with a sequence of identifiers + """ + return self.concat(id for id, pos in sorted(self._process_uast(uast), key=lambda x: x[1])) + + @staticmethod + def concat(id_sequence: Iterable): + return " ".join(id_sequence) diff --git a/sourced/ml/core/algorithms/uast_to_role_id_pairs.py b/sourced/ml/core/algorithms/uast_to_role_id_pairs.py new file mode 100644 index 0000000..08a81d7 --- /dev/null +++ b/sourced/ml/core/algorithms/uast_to_role_id_pairs.py @@ -0,0 +1,69 @@ +from typing import Iterable, Tuple + +import bblfsh + +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag +from sourced.ml.core.utils import bblfsh_roles + + +class Uast2RoleIdPairs(UastIds2Bag): + """ + Converts a UAST to a list of pairs. Pair is identifier and role, where role is Node role + where identifier was found. 
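
In the biased walk above, the back-step decision is a two-outcome alias sample: weight 1/p for returning to the previous node versus 1/q per child for moving on. With the defaults used by `UastRandomWalk2Bag` (p=0.79, q=0.82) and a node with three children, the back-step probability works out as:

```python
p_explore_neighborhood = 0.79
q_leave_neighborhood = 0.82
n_children = 3

threshold = 1 / p_explore_neighborhood
threshold /= threshold + n_children / q_leave_neighborhood
print(round(threshold, 3))  # ~0.257 chance of stepping back to the previous node
```
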
+ + __call__ is overridden here and returns list instead of bag-of-words (dist). + """ + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to token key. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + + """ + super().__init__(token2index=token2index, token_parser=token_parser) + self.exclude_roles = { + bblfsh_roles.EXPRESSION, + bblfsh_roles.IDENTIFIER, + bblfsh_roles.LEFT, + bblfsh_roles.QUALIFIED, + bblfsh_roles.BINARY, + bblfsh_roles.ASSIGNMENT, + } + + def __call__(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str]]: + """ + Converts a UAST to a list of identifier, role pairs. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: a list of identifier, role pairs. + """ + yield from self._process_uast(uast, []) + + def _process_uast(self, uast: bblfsh.Node, ancestors): + stack = [(uast, [])] + while stack: + node, ancestors = stack.pop() + + if bblfsh_roles.IDENTIFIER in node.roles and node.token: + roles = set(node.roles) + indx = -1 + # We skip all Nodes with roles from `self.exclude_roles` set. + # We skip any Node with OPERATOR role. + # For them we take first parent Node from stack with another Role set. + while not (roles - self.exclude_roles and bblfsh_roles.OPERATOR not in roles): + roles = set(ancestors[indx].roles) + indx -= 1 + for sub in self._token_parser.process_token(node.token): + try: + yield (self._token2index[sub], self.merge_roles(roles)) + except KeyError: + continue + ancestors = list(ancestors) + ancestors.append(node) + stack.extend([(child, ancestors) for child in node.children]) + + @staticmethod + def merge_roles(roles: Iterable[int]): + return " | ".join(bblfsh.role_name(r) for r in sorted(roles)) diff --git a/sourced/ml/core/extractors/__init__.py b/sourced/ml/core/extractors/__init__.py new file mode 100644 index 0000000..2d6f137 --- /dev/null +++ b/sourced/ml/core/extractors/__init__.py @@ -0,0 +1,12 @@ +# flake8: noqa +from sourced.ml.core.extractors.helpers import __extractors__, get_names_from_kwargs, \ + register_extractor, filter_kwargs, create_extractors_from_args +from sourced.ml.core.extractors.bags_extractor import Extractor, BagsExtractor, RoleIdsExtractor +from sourced.ml.core.extractors.identifiers import IdentifiersBagExtractor +from sourced.ml.core.extractors.literals import LiteralsBagExtractor +from sourced.ml.core.extractors.uast_random_walk import UastRandomWalkBagExtractor +from sourced.ml.core.extractors.uast_seq import UastSeqBagExtractor +from sourced.ml.core.extractors.children import ChildrenBagExtractor +from sourced.ml.core.extractors.graphlets import GraphletBagExtractor +from sourced.ml.core.extractors.identifier_distance import IdentifierDistance +from sourced.ml.core.extractors.id_sequence import IdSequenceExtractor diff --git a/sourced/ml/core/extractors/bags_extractor.py b/sourced/ml/core/extractors/bags_extractor.py new file mode 100644 index 0000000..cf6637f --- /dev/null +++ b/sourced/ml/core/extractors/bags_extractor.py @@ -0,0 +1,95 @@ +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_to_role_id_pairs import Uast2RoleIdPairs +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class Extractor(PickleableLogger): + """ + Converts a single UAST via `algorithm` to anything you need. 
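
`merge_roles` in `Uast2RoleIdPairs` above renders a role set as a sorted, pipe-separated string. A sketch of that formatting with a tiny hypothetical id-to-name table in place of `bblfsh.role_name`:

```python
ROLE_NAMES = {1: "IDENTIFIER", 18: "CALL", 40: "ARGUMENT"}  # hypothetical role IDs


def merge_roles(roles):
    # roles are sorted by their numeric ID, as in Uast2RoleIdPairs.merge_roles
    return " | ".join(ROLE_NAMES[r] for r in sorted(roles))


print(merge_roles({40, 18}))  # CALL | ARGUMENT
```
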
+ It is a wrapper to use in `Uast2Features` Transformer in a pipeline. + """ + NAME = None # feature scheme name, should be overridden in the derived class. + ALGORITHM = None # algorithm class to extract from UAST + OPTS = {} # cmdline args which are passed into __init__() + + def _get_log_name(self): + return type(self).__name__ + + @classmethod + def get_kwargs_fromcmdline(cls, args): + prefix = cls.NAME + "_" + result = {} + for k, v in args.__dict__.items(): + if k.startswith(prefix): + result[k[len(prefix):]] = v + return result + + def extract(self, uast: bblfsh.Node): + yield from self.ALGORITHM(uast) + + +class BagsExtractor(Extractor): + """ + Converts a single UAST into the weighted set (dictionary), where elements are strings + and the values are floats. The derived classes must implement uast_to_bag(). + """ + DEFAULT_DOCFREQ_THRESHOLD = 5 + NAMESPACE = None # the beginning of each element in the bag + OPTS = {"weight": 1} # cmdline args which are passed into __init__() + + def __init__(self, docfreq_threshold=None, weight=None, **kwargs): + """ + :param docfreq_threshold: The minimum number of occurrences of an element to be included \ + into the bag + :param weight: TF-IDF will be multiplied by this weight to change importance of specific \ + bag extractor + :param kwargs: Parameters for parent constructor. + """ + super().__init__(**kwargs) + if docfreq_threshold is None: + docfreq_threshold = self.DEFAULT_DOCFREQ_THRESHOLD + self.docfreq_threshold = docfreq_threshold + self.docfreq = {} + self._ndocs = 0 + if weight is None: + self.weight = 1 + else: + self.weight = weight + + @property + def docfreq_threhold(self): + return self._docfreq_threshold + + @docfreq_threhold.setter + def docfreq_threshold(self, value): + if not isinstance(value, int): + raise TypeError("docfreq_threshold must be an integer, got %s" % type(value)) + if value < 1: + raise ValueError("docfreq_threshold must be >= 1, got %d" % value) + self._docfreq_threshold = value + + @property + def ndocs(self): + return self._ndocs + + @ndocs.setter + def ndocs(self, value): + if not isinstance(value, int): + raise TypeError("ndocs must be an integer, got %s" % type(value)) + if value < 1: + raise ValueError("ndocs must be >= 1, got %d" % value) + self._ndocs = value + + def extract(self, uast): + for key, val in self.uast_to_bag(uast).items(): + yield self.NAMESPACE + key, val * self.weight + + def uast_to_bag(self, uast): + raise NotImplementedError + + +class RoleIdsExtractor(Extractor): + NAME = "roleids" + ALGORITHM = Uast2RoleIdPairs(token_parser=NoopTokenParser()) diff --git a/sourced/ml/core/extractors/children.py b/sourced/ml/core/extractors/children.py new file mode 100644 index 0000000..6f2b7ff --- /dev/null +++ b/sourced/ml/core/extractors/children.py @@ -0,0 +1,49 @@ +import logging +from typing import Iterable, Tuple + +from sourced.ml.core.algorithms.uast_inttypes_to_nodes import Uast2QuantizedChildren +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import (filter_kwargs, get_names_from_kwargs, + register_extractor) + + +@register_extractor +class ChildrenBagExtractor(BagsExtractor): + """ + Converts a UAST to the bag of pairs (internal type, quantized number of children). + """ + NAME = "children" + NAMESPACE = "c." 
+ OPTS = dict(get_names_from_kwargs(Uast2QuantizedChildren.__init__)) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, Uast2QuantizedChildren.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast_to_bag = Uast2QuantizedChildren(**uast2bag_kwargs) + + @property + def npartitions(self): + return self.uast_to_bag.npartitions + + @property + def levels(self): + return self.uast_to_bag.levels + + def extract(self, uast): + if not self.uast_to_bag.levels: + # bypass NAMESPACE + gen = self.uast_to_bag(uast).items() + else: + gen = super().extract(uast) + for key, val in gen: + yield key, val + + def quantize(self, frequencies: Iterable[Tuple[str, Iterable[Tuple[int, int]]]]): + self.uast_to_bag.quantize(frequencies) + if self._log.isEnabledFor(logging.DEBUG): + for k, v in self.uast_to_bag.levels.items(): + self._log.debug("%s\n%s", k, v) diff --git a/sourced/ml/core/extractors/graphlets.py b/sourced/ml/core/extractors/graphlets.py new file mode 100644 index 0000000..bb9cf6b --- /dev/null +++ b/sourced/ml/core/extractors/graphlets.py @@ -0,0 +1,25 @@ +from sourced.ml.core.algorithms.uast_inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import \ + (filter_kwargs, get_names_from_kwargs, register_extractor) + + +@register_extractor +class GraphletBagExtractor(BagsExtractor): + NAME = "graphlet" + NAMESPACE = "g." + OPTS = dict(get_names_from_kwargs(Uast2GraphletBag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, Uast2GraphletBag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + uast2bag_kwargs = filter_kwargs(kwargs, Uast2GraphletBag.__init__) + self.uast2bag = Uast2GraphletBag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/extractors/helpers.py b/sourced/ml/core/extractors/helpers.py new file mode 100644 index 0000000..885c7d5 --- /dev/null +++ b/sourced/ml/core/extractors/helpers.py @@ -0,0 +1,32 @@ +import argparse +import inspect +from typing import List + +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + +__extractors__ = {} + + +def register_extractor(cls): + if not issubclass(cls, BagsExtractor): + raise TypeError("%s is not an instance of %s" % (cls.__name__, BagsExtractor.__name__)) + __extractors__[cls.NAME] = cls + return cls + + +def get_names_from_kwargs(f): + for k, v in inspect.signature(f).parameters.items(): + if v.default != inspect.Parameter.empty and isinstance( + v.default, (str, int, float, tuple)): + yield k.replace("_", "-"), v.default + + +def filter_kwargs(kwargs, func): + func_param = inspect.signature(func).parameters.keys() + return {k: v for k, v in kwargs.items() if k in func_param} + + +def create_extractors_from_args(args: argparse.Namespace) -> List[BagsExtractor]: + return [__extractors__[s](args.min_docfreq, log_level=args.log_level, + **__extractors__[s].get_kwargs_fromcmdline(args)) + for s in args.feature] diff --git a/sourced/ml/core/extractors/id_sequence.py b/sourced/ml/core/extractors/id_sequence.py new file mode 100644 index 0000000..b793d87 --- /dev/null +++ 
b/sourced/ml/core/extractors/id_sequence.py @@ -0,0 +1,32 @@ +from typing import Iterable + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_to_id_sequence import Uast2IdSequence +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + + +class IdSequenceExtractor(BagsExtractor): + """ + Extractor wrapper for Uast2RoleIdPairs algorithm. + Note that this is unusual BagsExtractor since it returns iterable instead of bag. + + The class did not wrap with @register_extractor because it does not produce bags as others do. + So nobody outside code will see it or use it directly. + For the same reason we a free to override NAMESPACE, NAME, OPTS fields with any value we want. + + TODO(zurk): Split BagsExtractor into two clases: Extractor and BagsExtractor(Extractor), + re-inherit this class from Extractor, delete explanations from docstring. + """ + NAMESPACE = "" + NAME = "id sequence" + OPTS = {} + + def __init__(self, split_stem=False, **kwargs): + super().__init__(**kwargs) + self.uast2id_sequence = Uast2IdSequence( + None, NoopTokenParser() if not split_stem else None) + + def extract(self, uast: bblfsh.Node) -> Iterable[str]: + yield self.uast2id_sequence(uast), None diff --git a/sourced/ml/core/extractors/identifier_distance.py b/sourced/ml/core/extractors/identifier_distance.py new file mode 100644 index 0000000..8168913 --- /dev/null +++ b/sourced/ml/core/extractors/identifier_distance.py @@ -0,0 +1,49 @@ +from typing import Iterable, Tuple + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_id_distance import (Uast2IdDistance, Uast2IdLineDistance, + Uast2IdTreeDistance) +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + + +class IdentifierDistance(BagsExtractor): + """ + Extractor wrapper for Uast2IdTreeDistance and Uast2IdLineDistance algorithm. + Note that this is an unusual BagsExtractor since it returns iterable instead of bag. + + The class did not wrap with @register_extractor because it does not produce bags as others do. + So nobody outside code will see it or use it directly. + For the same reason we a free to override NAMESPACE, NAME, OPTS fields with any value we want. + + TODO(zurk): Split BagsExtractor into two clases: Extractor and BagsExtractor(Extractor), + re-inherit this class from Extractor, delete explanations from docstring. 
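
Several extractors split their keyword arguments between `BagsExtractor.__init__` and the wrapped algorithm using `filter_kwargs` from `helpers.py`: only names present in the target signature are kept. A small illustration with hypothetical values and a stand-in constructor:

```python
import inspect


def filter_kwargs(kwargs, func):
    func_param = inspect.signature(func).parameters.keys()
    return {k: v for k, v in kwargs.items() if k in func_param}


def uast2bag_init(self, npartitions=20):  # stand-in for Uast2QuantizedChildren.__init__
    pass


print(filter_kwargs({"npartitions": 10, "weight": 2.0}, uast2bag_init))
# {'npartitions': 10} - "weight" stays behind for BagsExtractor.__init__
```
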
+ """ + NAMESPACE = "" + NAME = "Identifier distance" + OPTS = {} + DEFAULT_MAX_DISTANCE = Uast2IdDistance.DEFAULT_MAX_DISTANCE + + class DistanceType: + Tree = "tree" + Line = "line" + All = {Tree, Line} + + @staticmethod + def resolve(type): + if type == IdentifierDistance.DistanceType.Line: + return Uast2IdLineDistance + if type == IdentifierDistance.DistanceType.Tree: + return Uast2IdTreeDistance + raise ValueError("Unknown distance type: %s" % type) + + def __init__(self, split_stem=False, type="tree", max_distance=DEFAULT_MAX_DISTANCE, **kwargs): + super().__init__(**kwargs) + Uast2IdDistance = self.DistanceType.resolve(type) + self.uast2id_distance = Uast2IdDistance( + token_parser=NoopTokenParser() if not split_stem else None, + max_distance=max_distance) + + def extract(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str, int]]: + yield from self.uast2id_distance(uast) diff --git a/sourced/ml/core/extractors/identifiers.py b/sourced/ml/core/extractors/identifiers.py new file mode 100644 index 0000000..375d594 --- /dev/null +++ b/sourced/ml/core/extractors/identifiers.py @@ -0,0 +1,19 @@ +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import register_extractor + + +@register_extractor +class IdentifiersBagExtractor(BagsExtractor): + NAME = "id" + NAMESPACE = "i." + OPTS = {"split-stem": True} + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, split_stem=True, **kwargs): + super().__init__(docfreq_threshold, **kwargs) + self.id2bag = UastIds2Bag(None, NoopTokenParser() if not split_stem else None) + + def uast_to_bag(self, uast): + return self.id2bag(uast) diff --git a/sourced/ml/core/extractors/literals.py b/sourced/ml/core/extractors/literals.py new file mode 100644 index 0000000..6430d9e --- /dev/null +++ b/sourced/ml/core/extractors/literals.py @@ -0,0 +1,67 @@ +import codecs +from collections import defaultdict +import os + +from sourced.ml.core.algorithms.uast_ids_to_bag import uast2sequence, UastIds2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import register_extractor +from sourced.ml.core.utils import bblfsh_roles + + +class HashedTokenParser: + def process_token(self, token): + yield codecs.encode((hash(token) & 0xffffffffffffffff).to_bytes(8, "little"), + "hex_codec").decode() + + +class Literals2Bag(UastIds2Bag): + """ + Converts a UAST to a bag-of-literals. + """ + + XPATH = "//*[@roleLiteral]" + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + """ + token_parser = HashedTokenParser() if token_parser is None else token_parser + super().__init__(token2index, token_parser) + + def __call__(self, uast): + """ + HOTFIX for https://github.com/bblfsh/client-python/issues/92 + Converts a UAST to a weighed bag-of-literals. The weights are literals frequencies. + The tokens are preprocessed by _token_parser. + Overwrite __call__ to avoid issues with `bblfsh.filter`. + + :param uast: The UAST root node. 
+ :return: bag + """ + nodes = [node for node in uast2sequence(uast) if bblfsh_roles.LITERAL in node.roles] + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag + + +@register_extractor +class LiteralsBagExtractor(BagsExtractor): + NAME = "lit" + NAMESPACE = "l." + OPTS = BagsExtractor.OPTS.copy() + + def __init__(self, docfreq_threshold=None, **kwargs): + super().__init__(docfreq_threshold, **kwargs) + self.id2bag = Literals2Bag(None, HashedTokenParser()) + + def uast_to_bag(self, uast): + if os.getenv("PYTHONHASHSEED", "random") == "random": + raise RuntimeError("PYTHONHASHSEED must be set") + return self.id2bag(uast) diff --git a/sourced/ml/core/extractors/uast_random_walk.py b/sourced/ml/core/extractors/uast_random_walk.py new file mode 100644 index 0000000..b4db69e --- /dev/null +++ b/sourced/ml/core/extractors/uast_random_walk.py @@ -0,0 +1,23 @@ +from sourced.ml.core.algorithms.uast_struct_to_bag import UastRandomWalk2Bag +from sourced.ml.core.extractors.helpers import ( + BagsExtractor, filter_kwargs, get_names_from_kwargs, register_extractor) + + +@register_extractor +class UastRandomWalkBagExtractor(BagsExtractor): + NAME = "node2vec" + NAMESPACE = "r." + OPTS = dict(get_names_from_kwargs(UastRandomWalk2Bag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, UastRandomWalk2Bag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast2bag = UastRandomWalk2Bag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/extractors/uast_seq.py b/sourced/ml/core/extractors/uast_seq.py new file mode 100644 index 0000000..5f02138 --- /dev/null +++ b/sourced/ml/core/extractors/uast_seq.py @@ -0,0 +1,23 @@ +from sourced.ml.core.algorithms.uast_struct_to_bag import UastSeq2Bag +from sourced.ml.core.extractors.helpers import ( + BagsExtractor, filter_kwargs, get_names_from_kwargs, register_extractor) + + +@register_extractor +class UastSeqBagExtractor(BagsExtractor): + NAME = "uast2seq" + NAMESPACE = "s." 
+ OPTS = dict(get_names_from_kwargs(UastSeq2Bag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, UastSeq2Bag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast2bag = UastSeq2Bag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/modelforgecfg.py b/sourced/ml/core/modelforgecfg.py new file mode 100644 index 0000000..5148c1f --- /dev/null +++ b/sourced/ml/core/modelforgecfg.py @@ -0,0 +1,8 @@ +import os + + +VENDOR = "source{d}" +BACKEND = "gcs" +BACKEND_ARGS = "bucket=models.cdn.sourced.tech" +INDEX_REPO = "https://github.com/src-d/models" +CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "source{d}") diff --git a/sourced/ml/core/models/__init__.py b/sourced/ml/core/models/__init__.py new file mode 100644 index 0000000..5edb673 --- /dev/null +++ b/sourced/ml/core/models/__init__.py @@ -0,0 +1,12 @@ +# flake8: noqa +from sourced.ml.core.models.bow import BOW +from sourced.ml.core.models.coocc import Cooccurrences +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.ordered_df import OrderedDocumentFrequencies +from sourced.ml.core.models.id2vec import Id2Vec +from sourced.ml.core.models.tensorflow import TensorFlowModel +from sourced.ml.core.models.topics import Topics +from sourced.ml.core.models.quant import QuantizationLevels + +from sourced.ml.core.models.model_converters.merge_df import MergeDocFreq +from sourced.ml.core.models.model_converters.merge_bow import MergeBOW diff --git a/sourced/ml/core/models/bow.py b/sourced/ml/core/models/bow.py new file mode 100644 index 0000000..86f5447 --- /dev/null +++ b/sourced/ml/core/models/bow.py @@ -0,0 +1,131 @@ +import logging +from typing import Dict, Iterable, List + +from modelforge import assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, Model, \ + register_model, split_strings +from modelforge.progress_bar import progress_bar +from scipy import sparse + +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class BOW(Model): + """ + Weighted bag of words model. Every word is correspond to an index and its matrix column. + Bag is a word set from repository, file or anything else. + Word is source code identifier or its part. + This model depends on :class:`sourced.ml.models.DocumentFrequencies`. + """ + NAME = "bow" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains source code as weighted bag of words." + LICENSE = DEFAULT_LICENSE + + def construct(self, documents: List[str], tokens: List[str], matrix: sparse.spmatrix): + if matrix.shape[0] != len(documents): + raise ValueError("matrix shape mismatch, documents %d != %d" % ( + matrix.shape[0], len(documents))) + if matrix.shape[1] != len(tokens): + raise ValueError("matrix shape mismatch, tokens %d != %d" % ( + matrix.shape[1], len(tokens))) + self._documents = documents + self._matrix = matrix + self._tokens = tokens + return self + + def dump(self): + return "Shape: %s\n" \ + "First 10 documents: %s\n" \ + "First 10 tokens: %s" % \ + (self._matrix.shape, self._documents[:10], self.tokens[:10]) + + @property + def matrix(self) -> sparse.spmatrix: + """ + Returns the bags as a sparse matrix. Rows are documents and columns are tokens weight. 
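
The `BOW` model introduced above is documents by tokens: row i holds the weighted bag of document i. A minimal in-memory construction that satisfies the shape checks in `construct` (toy data, assuming `modelforge` and `scipy` are installed):

```python
from scipy import sparse

from sourced.ml.core.models import BOW

documents = ["repo1", "repo2"]
tokens = ["i.foo", "i.bar", "l.baz"]
matrix = sparse.csr_matrix([[1, 0, 2],
                            [0, 3, 0]])  # rows: documents, columns: tokens

bow = BOW().construct(documents, tokens, matrix)
print(len(bow))  # 2 documents
print(bow[0])    # ('repo1', indices of "i.foo" and "l.baz", their weights)
```
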
+ """ + return self._matrix + + @property + def documents(self): + """ + The list of documents in the model. + """ + return self._documents + + @property + def tokens(self): + """ + The list of tokens in the model. + """ + return self._tokens + + def __getitem__(self, item: int): + """ + Returns document name, word indices and weights for the given document index. + + :param item: Document index. + :return: (name, :class:`numpy.ndarray` with word indices, \ + :class:`numpy.ndarray` with weights) + """ + data = self._matrix[item] + return self._documents[item], data.indices, data.data + + def __iter__(self): + """ + Returns an iterator over the document indices. + """ + return iter(range(len(self))) + + def __len__(self): + """ + Returns the number of documents. + """ + return len(self._documents) + + def save(self, output: str, series: str, deps: Iterable = tuple(), + create_missing_dirs: bool = True): + if not deps: + try: + deps = [self.get_dep(DocumentFrequencies.NAME)] + except KeyError: + raise ValueError( + "You must specify DocumentFrequencies dependency to save BOW.") from None + super().save(output=output, series=series, deps=deps, + create_missing_dirs=create_missing_dirs) + + def convert_bow_to_vw(self, output: str): + log = logging.getLogger("bow2vw") + log.info("Writing %s", output) + with open(output, "w") as fout: + for index in progress_bar(self, log, expected_size=len(self)): + record = self[index] + fout.write(record[0].replace(":", "").replace(" ", "_") + " ") + pairs = [] + for t, v in zip(*record[1:]): + try: + word = self.tokens[t] + except (KeyError, IndexError): + log.warning("%d not found in the vocabulary", t) + continue + pairs.append("%s:%s" % (word, v)) + fout.write(" ".join(pairs)) + fout.write("\n") + + def documents_index(self) -> Dict[str, int]: + return {r: i for i, r in enumerate(self._documents)} + + def _generate_tree(self): + return {"documents": merge_strings(self._documents), + "matrix": disassemble_sparse_matrix(self._matrix), + "tokens": merge_strings(self.tokens)} + + def _load_tree_kwargs(self, tree: dict): + return {"documents": split_strings(tree["documents"]), + "matrix": assemble_sparse_matrix(tree["matrix"]), + "tokens": split_strings(tree["tokens"])} + + def _load_tree(self, tree: dict): + self.construct(**self._load_tree_kwargs(tree)) diff --git a/sourced/ml/core/models/coocc.py b/sourced/ml/core/models/coocc.py new file mode 100644 index 0000000..66f2dfc --- /dev/null +++ b/sourced/ml/core/models/coocc.py @@ -0,0 +1,62 @@ +from modelforge.model import ( + assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, Model, split_strings) +from modelforge.models import register_model + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Cooccurrences(Model): + """ + Co-occurrence matrix. + """ + NAME = "co-occurrences" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains the sparse co-occurrence matrix of source code identifiers." 
+ LICENSE = DEFAULT_LICENSE + + def construct(self, tokens, matrix): + self._tokens = tokens + self._matrix = matrix + return self + + def _load_tree(self, tree): + self.construct(tokens=split_strings(tree["tokens"]), + matrix=assemble_sparse_matrix(tree["matrix"])) + + def dump(self): + return """Number of words: %d +First 10 words: %s +Matrix: shape: %s non-zero: %d""" % ( + len(self.tokens), self.tokens[:10], self.matrix.shape, self.matrix.getnnz()) + + @property + def tokens(self): + """ + Returns the tokens in the order which corresponds to the matrix's rows and cols. + """ + return self._tokens + + @property + def matrix(self): + """ + Returns the sparse co-occurrence matrix. + """ + return self._matrix + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._tokens) + + def _generate_tree(self): + return {"tokens": merge_strings(self.tokens), + "matrix": disassemble_sparse_matrix(self.matrix)} + + def matrix_to_rdd(self, spark_context: "pyspark.SparkContext") -> "pyspark.RDD": + self._log.info("Convert coocc model to RDD...") + rdd_row = spark_context.parallelize(self._matrix.row) + rdd_col = spark_context.parallelize(self._matrix.col) + rdd_data = spark_context.parallelize(self._matrix.data) + return rdd_row.zip(rdd_col).zip(rdd_data) diff --git a/sourced/ml/core/models/df.py b/sourced/ml/core/models/df.py new file mode 100644 index 0000000..fad83d4 --- /dev/null +++ b/sourced/ml/core/models/df.py @@ -0,0 +1,169 @@ +from itertools import islice +from typing import Dict, Iterable, List, Union + +from modelforge import merge_strings, Model, register_model, split_strings +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class DocumentFrequencies(Model): + """ + Document frequencies - number of times a source code identifier appeared + in different repositories. Each repository counts only once. + """ + NAME = "docfreq" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains document frequencies of features extracted from code." + LICENSE = DEFAULT_LICENSE + + def construct(self, docs: int, tokfreqs: Union[Iterable[Dict[str, int]], Dict[str, int]]): + """ + Initializes this model. + :param docs: The number of documents. + :param tokfreqs: The dictionary of token -> frequency or the iterable collection of such + dictionaries. + :return: self + """ + if isinstance(tokfreqs, dict): + df = tokfreqs + else: + df = {} + for d in tokfreqs: + df.update(d) + self._docs = docs + self._df = df + return self + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ + + def _load_tree(self, tree: dict, tokens=None): + if tokens is None: + tokens = split_strings(tree["tokens"]) + freqs = tree["freqs"] + self._log.info("Building the docfreq dictionary...") + tokfreq = dict(zip(tokens, freqs)) + self.construct(docs=tree["docs"], tokfreqs=tokfreq) + + def _generate_tree(self): + tokens = self.tokens() + freqs = numpy.array([self._df[t] for t in tokens], dtype=numpy.float32) + return {"docs": self.docs, "tokens": merge_strings(tokens), "freqs": freqs} + + def dump(self): + return """Number of words: %d +Random 10 words: %s +Number of documents: %d""" % ( + len(self._df), dict(islice(self._df.items(), 10)), self.docs) + + @property + def docs(self) -> int: + """ + Returns the number of documents. + """ + return self._docs + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ + + def prune(self, threshold: int) -> "DocumentFrequencies": + """ + Removes tokens which occur less than `threshold` times. 
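
`DocumentFrequencies` above is essentially a token-to-document-count mapping plus the total document count; `prune` and `greatest` return new, smaller models. A quick usage sketch with toy frequencies, again assuming `modelforge` is available:

```python
from sourced.ml.core.models import DocumentFrequencies

df = DocumentFrequencies().construct(docs=10,
                                     tokfreqs={"foo": 8, "bar": 2, "baz": 1})
print(len(df), df.docs, df.get("bar"))  # 3 10 2
print(len(df.prune(2)))                 # 2 - "baz" falls below the threshold
print(len(df.greatest(1)))              # 1 - only the most frequent token survives
```
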
+ The operation happens *not* in-place - a new model is returned. + :param threshold: Minimum number of occurrences. + :return: The new model if the current one had to be changed, otherwise self. + """ + if threshold < 1: + raise ValueError("Invalid threshold: %d" % threshold) + if threshold == 1: + return self + self._log.info("Pruning to min %d occurrences", threshold) + pruned = type(self)() + pruned._docs = self.docs + pruned._df = {k: v for k, v in self._df.items() if v >= threshold} + self._log.info("Size: %d -> %d", len(self), len(pruned)) + pruned._meta = self.meta + return pruned + + def greatest(self, max_size: int) -> "DocumentFrequencies": + """ + Truncates the model to most frequent `max_size` tokens. + The operation happens *not* in-place - a new model is returned. + :param max_size: The maximum vocabulary size. + :return: The new model if the current one had to be changed, otherwise self. + """ + if max_size < 1: + raise ValueError("Invalid max_size: %d" % max_size) + if len(self) <= max_size: + return self + self._log.info("Pruning to max %d size", max_size) + pruned = type(self)() + pruned._docs = self.docs + freqs = numpy.fromiter(self._df.values(), dtype=numpy.int32, count=len(self)) + keys = numpy.array(list(self._df.keys()), dtype=object) + chosen = numpy.argpartition(freqs, len(freqs) - max_size)[len(freqs) - max_size:] + border_freq = freqs[chosen].min() + chosen = freqs >= border_freq + # argpartition can leave some of the elements with freq == border_freq outside + # so next step ensures that we include everything. + freqs = freqs[chosen] + keys = keys[chosen] + # we need to be deterministic at the cutoff frequency + # argpartition returns random samples every time + # so we treat words with the cutoff frequency separately + if max_size != freqs.shape[0]: + assert max_size < freqs.shape[0] + border_freq_indexes = freqs == border_freq + border_keys = keys[border_freq_indexes] + border_keys.sort() + border_keys = border_keys[:max_size - freqs.shape[0]] + df = dict(zip(keys[~border_freq_indexes], freqs[~border_freq_indexes])) + df.update({key: border_freq for key in border_keys}) + else: + df = dict(zip(keys, freqs)) + pruned._df = df + self._log.info("Size: %d -> %d", len(self), len(pruned)) + pruned._meta = self.meta + return pruned + + def __getitem__(self, item): + return self._df[item] + + def __iter__(self): + return iter(self._df.items()) + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._df) + + def get(self, item, default=None) -> Union[int, None]: + """ + Return the document frequency for a given token. + + :param item: The token to query. + :param default: Returned value in case the token is missing. + :return: int or `default` + """ + return self._df.get(item, default) + + def tokens(self) -> List[str]: + """ + Returns the list of tokens. + """ + return list(self._df) + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ diff --git a/sourced/ml/core/models/id2vec.py b/sourced/ml/core/models/id2vec.py new file mode 100644 index 0000000..782ae23 --- /dev/null +++ b/sourced/ml/core/models/id2vec.py @@ -0,0 +1,66 @@ +from modelforge import merge_strings, Model, register_model, split_strings + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Id2Vec(Model): + """ + id2vec model - source code identifier embeddings. + """ + NAME = "id2vec" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains information on source code as identifier embeddings." 
+ LICENSE = DEFAULT_LICENSE + + def construct(self, embeddings, tokens): + self._embeddings = embeddings + self._tokens = tokens + self._log.info("Building the token index...") + self._token2index = {w: i for i, w in enumerate(self._tokens)} + return self + + def _load_tree(self, tree): + self.construct(embeddings=tree["embeddings"].copy(), + tokens=split_strings(tree["tokens"])) + + def dump(self): + return """Shape: %s +First 10 words: %s""" % ( + self.embeddings.shape, self.tokens[:10]) + + @property + def embeddings(self): + """ + :class:`numpy.ndarray` with the embeddings of shape + (N tokens x embedding dims). + """ + return self._embeddings + + @property + def tokens(self): + """ + List with the processed source code identifiers. + """ + return self._tokens + + def items(self): + """ + Returns the tuples belonging to token -> index mapping. + """ + return self._token2index.items() + + def __getitem__(self, item): + """ + Returns the index of the specified processed source code identifier. + """ + return self._token2index[item] + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._tokens) + + def _generate_tree(self): + return {"embeddings": self.embeddings, "tokens": merge_strings(self.tokens)} diff --git a/sourced/ml/core/models/license.py b/sourced/ml/core/models/license.py new file mode 100644 index 0000000..9ebe479 --- /dev/null +++ b/sourced/ml/core/models/license.py @@ -0,0 +1,3 @@ +"""Default license used for the models.""" + +DEFAULT_LICENSE = "ODbL-1.0" diff --git a/sourced/ml/core/models/model_converters/__init__.py b/sourced/ml/core/models/model_converters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/models/model_converters/base.py b/sourced/ml/core/models/model_converters/base.py new file mode 100644 index 0000000..bcc0ee2 --- /dev/null +++ b/sourced/ml/core/models/model_converters/base.py @@ -0,0 +1,115 @@ +import logging +import multiprocessing +import os +from typing import List, Union + +from modelforge import Model +from modelforge.progress_bar import progress_bar + +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class Model2Base(PickleableLogger): + """ + Base class for model -> model conversions. + """ + MODEL_FROM_CLASS = None + MODEL_TO_CLASS = None + + def __init__(self, num_processes: int = 0, + log_level: int = logging.DEBUG, overwrite_existing: bool = True): + """ + Initializes a new instance of Model2Base class. + + :param num_processes: The number of processes to execute for conversion. + :param log_level: Logging verbosity level. + :param overwrite_existing: Rewrite existing models or skip them. + """ + super().__init__(log_level=log_level) + self.num_processes = multiprocessing.cpu_count() if num_processes == 0 else num_processes + self.overwrite_existing = overwrite_existing + + def convert(self, models_path: List[str], destdir: str) -> int: + """ + Performs the model -> model conversion. Runs the conversions in a pool of processes. + + :param models_path: List of Models path. + :param destdir: The directory where to store the models. The directory structure is \ + preserved. + :return: The number of converted files. 
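
`Id2Vec` above is a thin wrapper around an embedding matrix and a token index: `model[token]` returns the row number and `model.embeddings[row]` the vector. The lookup in miniature, with made-up numbers:

```python
import numpy

embeddings = numpy.array([[0.10, 0.20],
                          [0.30, 0.40]], dtype=numpy.float32)  # toy 2-D embeddings
tokens = ["foo", "bar"]
token2index = {w: i for i, w in enumerate(tokens)}  # what construct() builds

print(embeddings[token2index["bar"]])  # [0.3 0.4]
```
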
+ """ + files = list(models_path) + self._log.info("Found %d files", len(files)) + if not files: + return 0 + queue_in = multiprocessing.Manager().Queue() + queue_out = multiprocessing.Manager().Queue(1) + processes = [multiprocessing.Process(target=self._process_entry, + args=(i, destdir, queue_in, queue_out)) + for i in range(self.num_processes)] + for p in processes: + p.start() + for f in files: + queue_in.put(f) + for _ in processes: + queue_in.put(None) + failures = 0 + for _ in progress_bar(files, self._log, expected_size=len(files)): + filename, ok = queue_out.get() + if not ok: + failures += 1 + for p in processes: + p.join() + self._log.info("Finished, %d failed files", failures) + return len(files) - failures + + def convert_model(self, model: Model) -> Union[Model, None]: + """ + This must be implemented in the child classes. + + :param model: The model instance to convert. + :return: The converted model instance or None if it is not needed. + """ + raise NotImplementedError + + def finalize(self, index: int, destdir: str): + """ + Called for each worker in the end of the processing. + + :param index: Worker's index. + :param destdir: The directory where to store the models. + """ + pass + + def _process_entry(self, index, destdir, queue_in, queue_out): + while True: + filepath = queue_in.get() + if filepath is None: + break + try: + model_path = os.path.join(destdir, os.path.split(filepath)[1]) + if os.path.exists(model_path): + if self.overwrite_existing: + self._log.warning( + "Model %s already exists, but will be overwrite. If you want to " + "skip existing models use --disable-overwrite flag", model_path) + else: + self._log.warning("Model %s already exists, skipping.", model_path) + queue_out.put((filepath, True)) + continue + model_from = self.MODEL_FROM_CLASS(log_level=self._log.level).load(filepath) + model_to = self.convert_model(model_from) + if model_to is not None: + dirs = os.path.dirname(model_path) + if dirs: + os.makedirs(dirs, exist_ok=True) + model_to.save(model_path, deps=model_to.meta["dependencies"]) + except: # noqa + self._log.exception("%s failed", filepath) + queue_out.put((filepath, False)) + else: + queue_out.put((filepath, True)) + self.finalize(index, destdir) + + def _get_log_name(self): + return "%s2%s" % (self.MODEL_FROM_CLASS.NAME, self.MODEL_TO_CLASS.NAME) diff --git a/sourced/ml/core/models/model_converters/merge_bow.py b/sourced/ml/core/models/model_converters/merge_bow.py new file mode 100644 index 0000000..305ccb1 --- /dev/null +++ b/sourced/ml/core/models/model_converters/merge_bow.py @@ -0,0 +1,69 @@ +import os + +from scipy.sparse import vstack + +from sourced.ml.core import extractors +from sourced.ml.core.models.bow import BOW +from sourced.ml.core.models.model_converters.base import Model2Base + + +class MergeBOW(Model2Base): + """ + Merges several :class:`BOW` models together. 
+ """ + MODEL_FROM_CLASS = BOW + MODEL_TO_CLASS = BOW + + def __init__(self, features=None, *args, **kwargs): + super().__init__(num_processes=1, *args, **kwargs) + self.documents = None + self.tokens = None + self.matrix = None + self.deps = None + self.features_namespaces = None + if features: + self.features_namespaces = [ex.NAMESPACE for ex in extractors.__extractors__.values() + if ex.NAME in features] + + def convert_model(self, model: BOW) -> None: + if self.tokens is None: + self.tokens = model.tokens + self.documents = model.documents + self.matrix = [model.matrix.tocsr()] + self.deps = model._meta["dependencies"] + elif set(self.tokens) != set(model.tokens): + raise ValueError("Models don't share the same set of tokens !") + else: + self.documents += model.documents + self.matrix.append(model.matrix.tocsr()) + + def finalize(self, index: int, destdir: str): + self._log.info("Stacking matrices ...") + matrix = self.matrix.pop(0) + while self.matrix: + matrix = vstack([matrix, self.matrix.pop(0)]) + self._log.info("%s matrices to stack ...", len(self.matrix)) + self.matrix = matrix + self._log.info("Writing model ...") + if self.features_namespaces: + self._reduce_matrix() + BOW(log_level=self._log.level) \ + .construct(self.documents, self.tokens, self.matrix) \ + .save(output=self._save_path(index, destdir), series="id2vec", deps=self.deps) + + def _reduce_matrix(self): + reduced_tokens = [] + columns = [] + matrix = self.matrix.tocsc() + for i, token in enumerate(self.tokens): + if token.split(".")[0] in self.features_namespaces: + reduced_tokens.append(token) + columns.append(i) + self.tokens = reduced_tokens + self.matrix = matrix[:, columns] + + @staticmethod + def _save_path(index: int, destdir: str): + if destdir.endswith(".asdf"): + return destdir + return os.path.join(destdir, "bow_%d.asdf" % index) diff --git a/sourced/ml/core/models/model_converters/merge_df.py b/sourced/ml/core/models/model_converters/merge_df.py new file mode 100644 index 0000000..68f0a58 --- /dev/null +++ b/sourced/ml/core/models/model_converters/merge_df.py @@ -0,0 +1,42 @@ +from collections import defaultdict +import os + +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.model_converters.base import Model2Base +from sourced.ml.core.models.ordered_df import OrderedDocumentFrequencies + + +class MergeDocFreq(Model2Base): + """ + Merges several :class:`DocumentFrequencies` models together. 
+ """ + MODEL_FROM_CLASS = DocumentFrequencies + MODEL_TO_CLASS = DocumentFrequencies + + def __init__(self, min_docfreq: int, vocabulary_size: int, ordered: bool = False, + *args, **kwargs): + super().__init__(num_processes=1, *args, **kwargs) + self.ordered = ordered + self.min_docfreq = min_docfreq + self.vocabulary_size = vocabulary_size + self._df = defaultdict(int) + self._docs = 0 + + def convert_model(self, model: DocumentFrequencies) -> None: + for word, freq in model: + self._df[word] += freq + self._docs += model.docs + + def finalize(self, index: int, destdir: str): + df_model = OrderedDocumentFrequencies if self.ordered else DocumentFrequencies + df_model(log_level=self._log.level) \ + .construct(self._docs, self._df) \ + .prune(self.min_docfreq) \ + .greatest(self.vocabulary_size) \ + .save(output=self._save_path(index, destdir), series="id2vec") + + @staticmethod + def _save_path(index: int, destdir: str): + if destdir.endswith(".asdf"): + return destdir + return os.path.join(destdir, "docfreq_%d.asdf" % index) diff --git a/sourced/ml/core/models/ordered_df.py b/sourced/ml/core/models/ordered_df.py new file mode 100644 index 0000000..ecfd6ed --- /dev/null +++ b/sourced/ml/core/models/ordered_df.py @@ -0,0 +1,60 @@ +from typing import Dict, Iterable, List + +from modelforge import merge_strings, register_model, split_strings +import numpy + +from sourced.ml.core.models import DocumentFrequencies + + +@register_model +class OrderedDocumentFrequencies(DocumentFrequencies): + """ + Compatible with the original DocumentFrequencies. This model maintains the determinitic + sequence of the tokens. + """ + # NAME is the same + + def construct(self, docs: int, tokfreqs: Iterable[Dict[str, int]]): + super().construct(docs, tokfreqs) + self._log.info("Ordering the keys...") + keys = sorted(self._df) + self._order = {k: i for i, k in enumerate(keys)} + return self + + @property + def order(self) -> Dict[str, int]: + return self._order + + def tokens(self) -> List[str]: + arr = [None for _ in range(len(self))] + for k, v in self.order.items(): + arr[v] = k + return arr + + def _load_tree(self, tree): + tokens = split_strings(tree["tokens"]) + super()._load_tree(tree, tokens) + self._log.info("Mapping the keys order...") + self._order = {k: i for i, k in enumerate(tokens)} + + def _generate_tree(self): + tokens = [None] * len(self) + freqs = numpy.zeros(len(self), dtype=numpy.float32) + for k, i in self._order.items(): + tokens[i] = k + freqs[i] = self._df[k] + return {"docs": self.docs, "tokens": merge_strings(tokens), "freqs": freqs} + + def prune(self, threshold: int) -> "OrderedDocumentFrequencies": + pruned = super().prune(threshold) + if pruned is not self: + self._log.info("Recovering the order...") + pruned._order = {k: i for i, k in enumerate(sorted(pruned._df))} + return pruned + + def greatest(self, max_size: int) -> "OrderedDocumentFrequencies": + pruned = super().greatest(max_size) + if pruned is not self: + self._log.info("Recovering the order...") + pruned._order = {k: i for i, k in enumerate(sorted(pruned._df))} + return pruned diff --git a/sourced/ml/core/models/quant.py b/sourced/ml/core/models/quant.py new file mode 100644 index 0000000..f56e253 --- /dev/null +++ b/sourced/ml/core/models/quant.py @@ -0,0 +1,62 @@ +from typing import Dict + +from modelforge import merge_strings, Model, register_model, split_strings +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class QuantizationLevels(Model): + """ + This model contains 
quantization levels for multiple schemes (feature types). + Every feature "class" (type, possible distinct value) corresponds to the numpy array + with integer level borders. The size of each numpy array is (the number of levels + 1). + """ + NAME = "quant" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains quantization levels for multiple schemes (feature types)." + LICENSE = DEFAULT_LICENSE + + def construct(self, levels: Dict[str, Dict[str, numpy.ndarray]]): + self._levels = levels + return self + + @property + def levels(self) -> Dict[str, Dict[str, numpy.ndarray]]: + return self._levels + + def __len__(self): + return len(self.levels) + + def _load_tree(self, tree): + self._levels = {} + for key, vals in tree["schemes"].items(): + classes = split_strings(vals["classes"]) + levels = vals["levels"] + self.levels[key] = dict(zip(classes, numpy.split(levels, len(classes)))) + + def _generate_tree(self): + tree = {"schemes": {}} + for key, vals in self.levels.items(): + tree["schemes"][key] = scheme = {} + npartitions = len(next(iter(vals.values()))) + classes = [None for _ in range(len(vals))] + scheme["levels"] = levels = numpy.zeros(len(vals) * npartitions, dtype=numpy.int32) + for i, pair in enumerate(vals.items()): + classes[i], levels[i * npartitions:(i + 1) * npartitions] = pair + scheme["classes"] = merge_strings(classes) + return tree + + def dump(self): + return """Schemes: %s""" % ( + sorted((v[0], "%d@%d" % (len(v[1]), len(next(iter(v[1].values()))) - 1)) + for v in self.levels.items())) + + def apply_quantization(self, extractors): + for extractor in extractors: + try: + extractor.quantize + except AttributeError: + continue + extractor.uast_to_bag.levels = self._levels[extractor.NAME] diff --git a/sourced/ml/core/models/tensorflow.py b/sourced/ml/core/models/tensorflow.py new file mode 100644 index 0000000..a45f97b --- /dev/null +++ b/sourced/ml/core/models/tensorflow.py @@ -0,0 +1,50 @@ +from typing import List + +from modelforge import Model, register_model +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class TensorFlowModel(Model): + """ + TensorFlow Protobuf model exported in the Modelforge format with GraphDef inside. + """ + NAME = "tensorflow-model" + VENDOR = "source{d}" + DESCRIPTION = "TensorFlow Protobuf model that contains a GraphDef instance." + LICENSE = DEFAULT_LICENSE + + def construct(self, graphdef: "tensorflow.GraphDef" = None, # noqa: F821 + session: "tensorflow.Session" = None, # noqa: F821 + outputs: List[str] = None): + if graphdef is None: + assert session is not None + assert outputs is not None + graphdef = session.graph_def + from tensorflow.python.framework import graph_util + for node in graphdef.node: + node.device = "" + graphdef = graph_util.convert_variables_to_constants( + session, graphdef, outputs) + self._graphdef = graphdef + return self + + @property + def graphdef(self): + """ + Returns the wrapped TensorFlow GraphDef. 
+ """ + return self._graphdef + + def _generate_tree(self) -> dict: + return {"graphdef": numpy.frombuffer(self._graphdef.SerializeToString(), + dtype=numpy.uint8)} + + def _load_tree(self, tree: dict): + from tensorflow.core.framework import graph_pb2 + + graphdef = graph_pb2.GraphDef() + graphdef.ParseFromString(tree["graphdef"].data) + self.construct(graphdef=graphdef) diff --git a/sourced/ml/core/models/topics.py b/sourced/ml/core/models/topics.py new file mode 100644 index 0000000..95cf873 --- /dev/null +++ b/sourced/ml/core/models/topics.py @@ -0,0 +1,86 @@ +from typing import Union + +from modelforge import assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, \ + Model, register_model, split_strings + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Topics(Model): + NAME = "topics" + VENDOR = "source{d}" + DESCRIPTION = "Model that is used to identify topics of source code repositories." + LICENSE = DEFAULT_LICENSE + + @property + def tokens(self): + return self._tokens + + @property + def topics(self): + """ + May be None if no topics are labeled. + """ + return self._topics + + @property + def matrix(self): + """ + Rows: tokens + Columns: topics + """ + return self._matrix + + def construct(self, tokens: list, topics: Union[list, None], matrix): + if len(tokens) != matrix.shape[1]: + raise ValueError("Tokens and matrix do not match.") + self._tokens = tokens + self._topics = topics + self._matrix = matrix + return self + + def _load_tree(self, tree: dict) -> None: + self.construct(split_strings(tree["tokens"]), + split_strings(tree["topics"]) if tree["topics"] else None, + assemble_sparse_matrix(tree["matrix"])) + + def dump(self) -> str: + res = "%d topics, %d tokens\nFirst 10 tokens: %s\nTopics: " % ( + self.matrix.shape + (self.tokens[:10],)) + if self.topics is not None: + res += "labeled, first 10: %s\n" % self.topics[:10] + else: + res += "unlabeled\n" + nnz = self.matrix.getnnz() + res += "non-zero elements: %d (%f)" % ( + nnz, nnz / (self.matrix.shape[0] * self.matrix.shape[1])) + return res + + def _generate_tree(self): + return {"tokens": merge_strings(self.tokens), + "topics": merge_strings(self.topics) if self.topics is not None else False, + "matrix": disassemble_sparse_matrix(self.matrix)} + + def __len__(self): + """ + Returns the number of topics. + """ + return self.matrix.shape[0] + + def __getitem__(self, item): + """ + Returns the keywords sorted by significance from topic index. 
+ """ + row = self.matrix[item] + nnz = row.nonzero()[1] + pairs = [(-row[0, i], i) for i in nnz] + pairs.sort() + return [(self.tokens[pair[1]], -pair[0]) for pair in pairs] + + def label_topics(self, labels): + if len(labels) != len(self): + raise ValueError("Sizes do not match: %d != %d" % (len(labels), len(self))) + if not isinstance(labels[0], str): + raise TypeError("Labels must be strings") + self._topics = list(labels) diff --git a/sourced/ml/core/tests/.gitignore b/sourced/ml/core/tests/.gitignore new file mode 100644 index 0000000..559a840 --- /dev/null +++ b/sourced/ml/core/tests/.gitignore @@ -0,0 +1 @@ +swivel/shard-000-000.pb \ No newline at end of file diff --git a/sourced/ml/core/tests/__init__.py b/sourced/ml/core/tests/__init__.py new file mode 100644 index 0000000..1f8dc17 --- /dev/null +++ b/sourced/ml/core/tests/__init__.py @@ -0,0 +1,24 @@ +import sys + +from modelforge import slogging + + +utmain = sys.modules["__main__"] +if utmain.__package__ == "unittest" and utmain.__spec__ is None: + from collections import namedtuple + ModuleSpec = namedtuple("ModuleSpec", ["name"]) + utmain.__spec__ = ModuleSpec("unittest.__main__") + del ModuleSpec +del utmain + + +def has_tensorflow(): + try: + import tensorflow # noqa + return True + except ImportError: + return False + + +def setup(): + slogging.setup("INFO", False) diff --git a/sourced/ml/core/tests/asdf/bow.asdf b/sourced/ml/core/tests/asdf/bow.asdf new file mode 100644 index 0000000..26b8ea0 Binary files /dev/null and b/sourced/ml/core/tests/asdf/bow.asdf differ diff --git a/sourced/ml/core/tests/asdf/coocc.asdf b/sourced/ml/core/tests/asdf/coocc.asdf new file mode 100644 index 0000000..9498b99 Binary files /dev/null and b/sourced/ml/core/tests/asdf/coocc.asdf differ diff --git a/sourced/ml/core/tests/asdf/coocc_df.asdf b/sourced/ml/core/tests/asdf/coocc_df.asdf new file mode 100644 index 0000000..b40f5d7 Binary files /dev/null and b/sourced/ml/core/tests/asdf/coocc_df.asdf differ diff --git a/sourced/ml/core/tests/asdf/docfreq_1000.asdf b/sourced/ml/core/tests/asdf/docfreq_1000.asdf new file mode 100644 index 0000000..2fa308d Binary files /dev/null and b/sourced/ml/core/tests/asdf/docfreq_1000.asdf differ diff --git a/sourced/ml/core/tests/asdf/id2vec_1000.asdf b/sourced/ml/core/tests/asdf/id2vec_1000.asdf new file mode 100644 index 0000000..d410d31 Binary files /dev/null and b/sourced/ml/core/tests/asdf/id2vec_1000.asdf differ diff --git a/sourced/ml/core/tests/asdf/quant.asdf b/sourced/ml/core/tests/asdf/quant.asdf new file mode 100644 index 0000000..793c4ba Binary files /dev/null and b/sourced/ml/core/tests/asdf/quant.asdf differ diff --git a/sourced/ml/core/tests/asdf/topics.asdf b/sourced/ml/core/tests/asdf/topics.asdf new file mode 100644 index 0000000..50b89dd Binary files /dev/null and b/sourced/ml/core/tests/asdf/topics.asdf differ diff --git a/sourced/ml/core/tests/asdf/uast.asdf b/sourced/ml/core/tests/asdf/uast.asdf new file mode 100644 index 0000000..8ca7458 Binary files /dev/null and b/sourced/ml/core/tests/asdf/uast.asdf differ diff --git a/sourced/ml/core/tests/asdf/voccoocc.asdf b/sourced/ml/core/tests/asdf/voccoocc.asdf new file mode 100644 index 0000000..835ab91 Binary files /dev/null and b/sourced/ml/core/tests/asdf/voccoocc.asdf differ diff --git a/sourced/ml/core/tests/identifiers.csv.tar.gz b/sourced/ml/core/tests/identifiers.csv.tar.gz new file mode 100644 index 0000000..4fac851 Binary files /dev/null and b/sourced/ml/core/tests/identifiers.csv.tar.gz differ diff --git 
a/sourced/ml/core/tests/models.py b/sourced/ml/core/tests/models.py new file mode 100644 index 0000000..6eeb379 --- /dev/null +++ b/sourced/ml/core/tests/models.py @@ -0,0 +1,25 @@ +from os.path import dirname, join + +_root = dirname(__file__) +_models_path = join(_root, "asdf") + +ID2VEC = join(_models_path, "id2vec_1000.asdf") +DOCFREQ = join(_models_path, "docfreq_1000.asdf") +QUANTLEVELS = join(_models_path, "quant.asdf") +BOW = join(_models_path, "bow.asdf") +COOCC = join(_models_path, "coocc.asdf") +COOCC_DF = join(_models_path, "coocc_df.asdf") +UAST = join(_models_path, "uast.asdf") +TOPICS = join(_models_path, "topics.asdf") + +DATA_DIR_SOURCE = join(_root, "source") +SOURCE_FILENAME = "example" +SOURCE = join(DATA_DIR_SOURCE, "%s.asdf" % SOURCE_FILENAME) +SOURCE_PY = join(DATA_DIR_SOURCE, "%s.py" % SOURCE_FILENAME) + +TOPICS_SRC = "topics_readable.txt" +PARQUET_DIR = join(_root, "parquet") +SIVA_DIR = join(_root, "siva") +IDENTIFIERS = join(_root, "identifiers.csv.tar.gz") + +MODER_FUNC = join(DATA_DIR_SOURCE, "example_functions.py") diff --git a/sourced/ml/core/tests/models/test_tensorflow.py b/sourced/ml/core/tests/models/test_tensorflow.py new file mode 100644 index 0000000..f33d441 --- /dev/null +++ b/sourced/ml/core/tests/models/test_tensorflow.py @@ -0,0 +1,32 @@ +import io +import unittest + +from sourced.ml.core.models.tensorflow import TensorFlowModel +from sourced.ml.core.tests import has_tensorflow + + +class TensorFlowModelTests(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_serialize(self): + import tensorflow as tf + a = tf.constant([[1, 0], [0, 1]]) + b = tf.constant([[0, 1], [1, 0]]) + c = tf.matmul(a, b) + gd = tf.get_default_graph().as_graph_def() + buffer = io.BytesIO() + TensorFlowModel().construct(graphdef=gd).save(buffer, series="tensorflow-model") + buffer.seek(0) + model = TensorFlowModel().load(buffer) + self.assertEqual(gd.node, model.graphdef.node) + + buffer = io.BytesIO() + with tf.Session() as session: + TensorFlowModel().construct(session=session, outputs=[c.name[:-2]]).save( + buffer, series="tensorflow-model") + buffer.seek(0) + model = TensorFlowModel().load(buffer) + self.assertEqual(gd.node, model.graphdef.node) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/source/__init__.py b/sourced/ml/core/tests/source/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/tests/source/example.py b/sourced/ml/core/tests/source/example.py new file mode 100644 index 0000000..11a01bc --- /dev/null +++ b/sourced/ml/core/tests/source/example.py @@ -0,0 +1,16 @@ +import sys + +from modelforge.logs import setup_logging + + +utmain = sys.modules["__main__"] +if utmain.__package__ == "unittest" and utmain.__spec__ is None: + from collections import namedtuple + ModuleSpec = namedtuple("ModuleSpec", ["name"]) + utmain.__spec__ = ModuleSpec("unittest.__main__") + del ModuleSpec +del utmain + + +def setup(): + setup_logging("INFO") diff --git a/sourced/ml/core/tests/source/example_functions.py b/sourced/ml/core/tests/source/example_functions.py new file mode 100644 index 0000000..fd7d05a --- /dev/null +++ b/sourced/ml/core/tests/source/example_functions.py @@ -0,0 +1,16 @@ +class Foo: + def func_a(self): + # should be counted + pass + + +def func_b(): + # should be counted + pass + + +def func_c(): + # should be counted + def func_d(): + # should not be counted + pass diff --git a/sourced/ml/core/tests/swivel/col_sums.txt 
b/sourced/ml/core/tests/swivel/col_sums.txt new file mode 100644 index 0000000..44ea5c6 --- /dev/null +++ b/sourced/ml/core/tests/swivel/col_sums.txt @@ -0,0 +1,304 @@ +21 +58 +76 +59 +38 +92 +102 +102 +58 +26 +23 +44 +77 +68 +50 +49 +76 +25 +76 +49 +37 +44 +61 +75 +30 +90 +79 +102 +12 +3 +27 +74 +57 +62 +59 +58 +51 +100 +34 +75 +32 +113 +55 +19 +43 +65 +41 +49 +49 +39 +22 +22 +99 +37 +16 +29 +86 +49 +13 +14 +28 +44 +80 +50 +29 +20 +13 +154 +24 +76 +62 +70 +128 +27 +21 +39 +11 +45 +43 +124 +92 +80 +141 +88 +23 +24 +50 +52 +175 +43 +115 +12 +29 +16 +49 +48 +19 +95 +10 +63 +37 +102 +59 +21 +106 +76 +65 +50 +95 +59 +26 +29 +23 +21 +91 +78 +8 +78 +142 +50 +76 +112 +76 +61 +37 +133 +55 +93 +37 +19 +13 +16 +20 +32 +31 +31 +31 +64 +13 +58 +11 +21 +198 +77 +39 +50 +7 +59 +111 +12 +50 +102 +29 +141 +55 +58 +13 +39 +30 +22 +10 +27 +60 +40 +4 +58 +50 +76 +52 +74 +41 +59 +40 +74 +40 +40 +156 +73 +16 +32 +34 +31 +27 +93 +58 +31 +27 +19 +28 +64 +82 +44 +37 +37 +31 +62 +39 +95 +205 +25 +14 +18 +95 +26 +56 +10 +29 +59 +74 +24 +72 +19 +42 +18 +64 +33 +34 +54 +41 +51 +74 +41 +12 +9 +35 +25 +73 +39 +11 +76 +33 +9 +36 +52 +72 +27 +62 +45 +26 +149 +104 +64 +24 +64 +19 +26 +34 +21 +21 +22 +22 +97 +15 +61 +70 +27 +22 +85 +20 +107 +100 +104 +78 +17 +63 +40 +11 +141 +27 +30 +24 +78 +167 +32 +19 +89 +59 +46 +22 +6 +55 +50 +79 +5 +38 +12 +50 +97 +78 +29 +55 +72 +17 +95 +76 +59 +76 +61 +9 +45 +26 +32 +107 +19 \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/col_sums.txt.gz b/sourced/ml/core/tests/swivel/col_sums.txt.gz new file mode 100644 index 0000000..081051d Binary files /dev/null and b/sourced/ml/core/tests/swivel/col_sums.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/col_vocab.txt b/sourced/ml/core/tests/swivel/col_vocab.txt new file mode 100644 index 0000000..78d2478 --- /dev/null +++ b/sourced/ml/core/tests/swivel/col_vocab.txt @@ -0,0 +1,304 @@ +i.access +i.action +i.activ +i.adapt +i.add +i.android +i.antonioleiva +i.app +i.append +i.appgroup +i.args +i.argument +i.argv +i.array +i.arrays +i.aslist +i.bar +i.build +i.bundle +i.button +i.call +i.captur +i.chdir +i.check +i.clear +i.click +i.close +i.com +i.command +i.communic +i.compil +i.confdir +i.conffil +i.conffilenam +i.config +i.conn +i.connect +i.content +i.copytre +i.count +i.counter +i.create +i.credenti +i.crypt +i.cursor +i.data +i.date +i.datetim +i.dbfile +i.delay +i.destdir +i.destfil +i.destroy +i.dev +i.develop +i.dict +i.dir +i.directori +i.dirnam +i.dirpath +i.dirs +i.docs +i.dropbox +i.edit +i.empty +i.endswith +i.env +i.error +i.euroclear +i.except +i.execut +i.exists +i.exit +i.expandus +i.ext +i.extens +i.factori +i.false +i.fetchon +i.file +i.filenam +i.files +i.find +i.finish +i.float +i.fname +i.for +i.format +i.get +i.getcwd +i.getenv +i.gethostbynam +i.getsiz +i.github +i.gone +i.handler +i.header +i.hide +i.hkey +i.home +i.host +i.impl +i.inflat +i.info +i.input +i.instanc +i.int +i.intent +i.interactor +i.invis +i.isdir +i.isempti +i.isfile +i.isoformat +i.item +i.items +i.iter +i.java +i.join +i.key +i.layout +i.len +i.length +i.line +i.linux +i.list +i.listdir +i.listen +i.listfil +i.lite +i.ljust +i.local +i.localtim +i.log +i.logdir +i.logfil +i.logfilenam +i.login +i.logsdir +i.long +i.lower +i.machin +i.main +i.make +i.makedir +i.master +i.math +i.menu +i.messag +i.min +i.move +i.mvpexampl +i.myping +i.name +i.navig +i.new +i.newenv +i.newfil +i.newscript +i.nmap +i.nmscan +i.node +i.none +i.number +i.object +i.old +i.onclick +i.oncreat +i.onfinish +i.onitem +i.onlogin +i.onopt +i.onpassword 
+i.onresum +i.onsuccess +i.onusernam +i.open +i.option +i.optpars +i.ospath +i.output +i.outputdir +i.outputfil +i.overrid +i.parent +i.parse +i.parser +i.pass +i.passwd +i.password +i.path +i.platform +i.popen +i.port +i.ports +i.posit +i.post +i.present +i.print +i.processor +i.profil +i.program +i.progress +i.putty +i.query +i.randint +i.random +i.range +i.raw +i.rdp +i.read +i.readfil +i.readlin +i.recycl +i.releas +i.remove +i.rename +i.replac +i.res +i.result +i.resume +i.ret +i.return +i.root +i.row +i.rows +i.run +i.runnabl +i.salt +i.saved +i.scan +i.scanner +i.screen +i.script +i.select +i.send +i.server +i.serverfil +i.session +i.set +i.show +i.shutil +i.sid +i.simple +i.sisdir +i.site +i.size +i.sleep +i.socket +i.sourcedir +i.sourcefil +i.split +i.splitext +i.sqlite +i.start +i.startswith +i.stat +i.state +i.stats +i.str +i.strftime +i.string +i.strip +i.subnet +i.subprocess +i.success +i.sum +i.sys +i.system +i.table +i.tablelist +i.test +i.text +i.tgt +i.thread +i.time +i.toast +i.today +i.todaystr +i.tofile +i.tohome +i.tostr +i.true +i.txt +i.type +i.usage +i.user +i.usernam +i.util +i.utils +i.valid +i.value +i.version +i.view +i.visibl +i.walk +i.widget +i.window +i.winreg +i.with +i.word +i.work +i.write +i.zip \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/col_vocab.txt.gz b/sourced/ml/core/tests/swivel/col_vocab.txt.gz new file mode 100644 index 0000000..73b3620 Binary files /dev/null and b/sourced/ml/core/tests/swivel/col_vocab.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/row_sums.txt b/sourced/ml/core/tests/swivel/row_sums.txt new file mode 100644 index 0000000..44ea5c6 --- /dev/null +++ b/sourced/ml/core/tests/swivel/row_sums.txt @@ -0,0 +1,304 @@ +21 +58 +76 +59 +38 +92 +102 +102 +58 +26 +23 +44 +77 +68 +50 +49 +76 +25 +76 +49 +37 +44 +61 +75 +30 +90 +79 +102 +12 +3 +27 +74 +57 +62 +59 +58 +51 +100 +34 +75 +32 +113 +55 +19 +43 +65 +41 +49 +49 +39 +22 +22 +99 +37 +16 +29 +86 +49 +13 +14 +28 +44 +80 +50 +29 +20 +13 +154 +24 +76 +62 +70 +128 +27 +21 +39 +11 +45 +43 +124 +92 +80 +141 +88 +23 +24 +50 +52 +175 +43 +115 +12 +29 +16 +49 +48 +19 +95 +10 +63 +37 +102 +59 +21 +106 +76 +65 +50 +95 +59 +26 +29 +23 +21 +91 +78 +8 +78 +142 +50 +76 +112 +76 +61 +37 +133 +55 +93 +37 +19 +13 +16 +20 +32 +31 +31 +31 +64 +13 +58 +11 +21 +198 +77 +39 +50 +7 +59 +111 +12 +50 +102 +29 +141 +55 +58 +13 +39 +30 +22 +10 +27 +60 +40 +4 +58 +50 +76 +52 +74 +41 +59 +40 +74 +40 +40 +156 +73 +16 +32 +34 +31 +27 +93 +58 +31 +27 +19 +28 +64 +82 +44 +37 +37 +31 +62 +39 +95 +205 +25 +14 +18 +95 +26 +56 +10 +29 +59 +74 +24 +72 +19 +42 +18 +64 +33 +34 +54 +41 +51 +74 +41 +12 +9 +35 +25 +73 +39 +11 +76 +33 +9 +36 +52 +72 +27 +62 +45 +26 +149 +104 +64 +24 +64 +19 +26 +34 +21 +21 +22 +22 +97 +15 +61 +70 +27 +22 +85 +20 +107 +100 +104 +78 +17 +63 +40 +11 +141 +27 +30 +24 +78 +167 +32 +19 +89 +59 +46 +22 +6 +55 +50 +79 +5 +38 +12 +50 +97 +78 +29 +55 +72 +17 +95 +76 +59 +76 +61 +9 +45 +26 +32 +107 +19 \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/row_sums.txt.gz b/sourced/ml/core/tests/swivel/row_sums.txt.gz new file mode 100644 index 0000000..5680a5a Binary files /dev/null and b/sourced/ml/core/tests/swivel/row_sums.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/row_vocab.txt b/sourced/ml/core/tests/swivel/row_vocab.txt new file mode 100644 index 0000000..78d2478 --- /dev/null +++ b/sourced/ml/core/tests/swivel/row_vocab.txt @@ -0,0 +1,304 @@ +i.access +i.action +i.activ +i.adapt +i.add +i.android +i.antonioleiva +i.app +i.append 
+i.appgroup +i.args +i.argument +i.argv +i.array +i.arrays +i.aslist +i.bar +i.build +i.bundle +i.button +i.call +i.captur +i.chdir +i.check +i.clear +i.click +i.close +i.com +i.command +i.communic +i.compil +i.confdir +i.conffil +i.conffilenam +i.config +i.conn +i.connect +i.content +i.copytre +i.count +i.counter +i.create +i.credenti +i.crypt +i.cursor +i.data +i.date +i.datetim +i.dbfile +i.delay +i.destdir +i.destfil +i.destroy +i.dev +i.develop +i.dict +i.dir +i.directori +i.dirnam +i.dirpath +i.dirs +i.docs +i.dropbox +i.edit +i.empty +i.endswith +i.env +i.error +i.euroclear +i.except +i.execut +i.exists +i.exit +i.expandus +i.ext +i.extens +i.factori +i.false +i.fetchon +i.file +i.filenam +i.files +i.find +i.finish +i.float +i.fname +i.for +i.format +i.get +i.getcwd +i.getenv +i.gethostbynam +i.getsiz +i.github +i.gone +i.handler +i.header +i.hide +i.hkey +i.home +i.host +i.impl +i.inflat +i.info +i.input +i.instanc +i.int +i.intent +i.interactor +i.invis +i.isdir +i.isempti +i.isfile +i.isoformat +i.item +i.items +i.iter +i.java +i.join +i.key +i.layout +i.len +i.length +i.line +i.linux +i.list +i.listdir +i.listen +i.listfil +i.lite +i.ljust +i.local +i.localtim +i.log +i.logdir +i.logfil +i.logfilenam +i.login +i.logsdir +i.long +i.lower +i.machin +i.main +i.make +i.makedir +i.master +i.math +i.menu +i.messag +i.min +i.move +i.mvpexampl +i.myping +i.name +i.navig +i.new +i.newenv +i.newfil +i.newscript +i.nmap +i.nmscan +i.node +i.none +i.number +i.object +i.old +i.onclick +i.oncreat +i.onfinish +i.onitem +i.onlogin +i.onopt +i.onpassword +i.onresum +i.onsuccess +i.onusernam +i.open +i.option +i.optpars +i.ospath +i.output +i.outputdir +i.outputfil +i.overrid +i.parent +i.parse +i.parser +i.pass +i.passwd +i.password +i.path +i.platform +i.popen +i.port +i.ports +i.posit +i.post +i.present +i.print +i.processor +i.profil +i.program +i.progress +i.putty +i.query +i.randint +i.random +i.range +i.raw +i.rdp +i.read +i.readfil +i.readlin +i.recycl +i.releas +i.remove +i.rename +i.replac +i.res +i.result +i.resume +i.ret +i.return +i.root +i.row +i.rows +i.run +i.runnabl +i.salt +i.saved +i.scan +i.scanner +i.screen +i.script +i.select +i.send +i.server +i.serverfil +i.session +i.set +i.show +i.shutil +i.sid +i.simple +i.sisdir +i.site +i.size +i.sleep +i.socket +i.sourcedir +i.sourcefil +i.split +i.splitext +i.sqlite +i.start +i.startswith +i.stat +i.state +i.stats +i.str +i.strftime +i.string +i.strip +i.subnet +i.subprocess +i.success +i.sum +i.sys +i.system +i.table +i.tablelist +i.test +i.text +i.tgt +i.thread +i.time +i.toast +i.today +i.todaystr +i.tofile +i.tohome +i.tostr +i.true +i.txt +i.type +i.usage +i.user +i.usernam +i.util +i.utils +i.valid +i.value +i.version +i.view +i.visibl +i.walk +i.widget +i.window +i.winreg +i.with +i.word +i.work +i.write +i.zip \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/row_vocab.txt.gz b/sourced/ml/core/tests/swivel/row_vocab.txt.gz new file mode 100644 index 0000000..5dd2fff Binary files /dev/null and b/sourced/ml/core/tests/swivel/row_vocab.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/shard-000-000.pb.gz b/sourced/ml/core/tests/swivel/shard-000-000.pb.gz new file mode 100644 index 0000000..7f2bf35 Binary files /dev/null and b/sourced/ml/core/tests/swivel/shard-000-000.pb.gz differ diff --git a/sourced/ml/core/tests/test_bblfsh_utils.py b/sourced/ml/core/tests/test_bblfsh_utils.py new file mode 100644 index 0000000..1e6f6c5 --- /dev/null +++ b/sourced/ml/core/tests/test_bblfsh_utils.py @@ -0,0 +1,85 @@ 
+import errno +import os +import random +import socket +import time +import unittest + +import docker.client + +from sourced.ml.core.utils.bblfsh import BBLFSH_VERSION_HIGH, BBLFSH_VERSION_LOW, check_version + + +@unittest.skipIf(os.getenv("SKIP_BBLFSH_UTILS_TESTS", False), "Skip ml_core.utils.bblfsh tests.") +class BblfshUtilsTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.docker_client = docker.from_env() + # ensure docker is running + try: + cls.docker_client.containers.list() + except Exception: + raise Exception("docker not running properly") + cls.er_msg = "supported bblfshd versions: " \ + ">=%s,<%s" % (BBLFSH_VERSION_LOW, BBLFSH_VERSION_HIGH) + + def __check_bblfsh_version_support(self, version: str) -> bool: + """ + :param version: version of bblfshd to check + :return: True if version is supported, False otherwise + """ + with socket.socket() as s: + for _ in range(3): + try: + port = random.randint(10000, 50000) + s.connect(("localhost", port)) + except socket.error as e: + if e.errno == errno.ECONNREFUSED: + break + + container = self.docker_client.containers.run( + image="bblfsh/bblfshd:%s" % version, + privileged=True, + detach=True, + ports={"9432": port}, + ) + + assert container is not None, "failed to create bblfsh container" + + for _ in range(10): + try: + res = check_version(port=port) + break + except Exception: + time.sleep(.1) + pass + + container.stop() + container.remove() + return res + + def test_v200(self): + self.assertFalse(self.__check_bblfsh_version_support("v2.0.0"), self.er_msg) + + def test_v210(self): + self.assertFalse(self.__check_bblfsh_version_support("v2.1.0"), self.er_msg) + + def test_v220(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.2.0"), self.er_msg) + + def test_v230(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.3.0"), self.er_msg) + + def test_v240(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.4.0"), self.er_msg) + + def test_v250(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.5.0"), self.er_msg) + + @classmethod + def tearDownClass(cls): + cls.docker_client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_bigartm.py b/sourced/ml/core/tests/test_bigartm.py new file mode 100644 index 0000000..1228c13 --- /dev/null +++ b/sourced/ml/core/tests/test_bigartm.py @@ -0,0 +1,29 @@ +import argparse +import os +import subprocess +import tempfile +import unittest + +from sourced.ml.core.utils import install_bigartm + + +class BigartmTests(unittest.TestCase): + gitdir = os.path.join(os.path.dirname(__file__), "..", "..") + + @unittest.skipUnless(os.getenv("FULL_TEST", False), "Need to define FULL_TEST env var.") + def test_install_bigartm(self): + with tempfile.TemporaryDirectory() as tmpdir: + args = argparse.Namespace(output=tmpdir, tmpdir=None) + self.assertIsNone(install_bigartm(args)) + self._valivate_bigartm(tmpdir) + + def _valivate_bigartm(self, tmpdir): + bigartm = os.path.join(tmpdir, "bigartm") + self.assertTrue(os.path.isfile(bigartm)) + self.assertEqual(os.stat(bigartm).st_mode & 0o777, 0o777) + output = subprocess.check_output([bigartm], stderr=subprocess.STDOUT) + self.assertIn("BigARTM v", output.decode()) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_bow.py b/sourced/ml/core/tests/test_bow.py new file mode 100644 index 0000000..4d054d0 --- /dev/null +++ b/sourced/ml/core/tests/test_bow.py @@ -0,0 +1,44 @@ +from io import BytesIO +import unittest + 
+import numpy + +from sourced.ml.core.models import BOW +import sourced.ml.core.tests.models as paths + + +class BOWTests(unittest.TestCase): + def setUp(self): + self.model = BOW().load(source=paths.BOW) + + def test_getitem(self): + repo_name, indices, weights = self.model[0] + self.assertEqual(repo_name, "repo1") + self.assertIsInstance(indices, numpy.ndarray) + self.assertIsInstance(weights, numpy.ndarray) + self.assertEqual(indices.shape, weights.shape) + self.assertEqual(indices.shape, (3,)) + + def test_iter(self): + pumped = list(self.model) + self.assertEqual(len(pumped), 5) + self.assertEqual(pumped, list(range(5))) + + def test_len(self): + self.assertEqual(len(self.model), 5) + + def test_tokens(self): + self.assertEqual(self.model.tokens[0], "i.") + + def test_write(self): + buffer = BytesIO() + self.model.save(output=buffer, series="bow-docfreq") + buffer.seek(0) + new_model = BOW().load(buffer) + self.assertEqual((self.model.matrix != new_model.matrix).nnz, 0) + self.assertEqual(self.model.documents, new_model.documents) + self.assertEqual(self.model.tokens, new_model.tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_coocc.py b/sourced/ml/core/tests/test_coocc.py new file mode 100644 index 0000000..d21a21b --- /dev/null +++ b/sourced/ml/core/tests/test_coocc.py @@ -0,0 +1,28 @@ +import unittest + +from sourced.ml.core.models import Cooccurrences +import sourced.ml.core.tests.models as paths + + +class CooccurrencesTests(unittest.TestCase): + def setUp(self): + self.model = Cooccurrences().load(source=paths.COOCC) + + def test_tokens(self): + tokens = self.model.tokens + self.assertIsInstance(tokens, list) + self.assertEqual(tokens[:10], ["i.set", "i.iter", "i.error", "i.logsdir", "i.read", + "i.captur", "i.clear", "i.android", "i.tohome", "i.ljust"]) + self.assertEqual(len(tokens), 304) + + def test_matrix(self): + matrix = self.model.matrix + self.assertEqual(matrix.shape, (304, 304)) + self.assertEqual(matrix.getnnz(), 16001) + + def test_len(self): + self.assertEqual(len(self.model), 304) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_df.py b/sourced/ml/core/tests/test_df.py new file mode 100644 index 0000000..b7b2872 --- /dev/null +++ b/sourced/ml/core/tests/test_df.py @@ -0,0 +1,101 @@ +from io import BytesIO +import unittest + +from sourced.ml.core.models import DocumentFrequencies +import sourced.ml.core.tests.models as paths + + +class DocumentFrequenciesTests(unittest.TestCase): + def setUp(self): + self.model = DocumentFrequencies().load(source=paths.DOCFREQ) + + def test_docs(self): + docs = self.model.docs + self.assertIsInstance(docs, int) + self.assertEqual(docs, 1000) + + def test_get(self): + self.assertEqual(self.model["aaaaaaa"], 341) + with self.assertRaises(KeyError): + print(self.model["xaaaaaa"]) + self.assertEqual(self.model.get("aaaaaaa", 0), 341) + self.assertEqual(self.model.get("xaaaaaa", 100500), 100500) + + def test_tokens(self): + self.assertEqual(list(self.model._df), self.model.tokens()) + + def test_len(self): + # the remaining 18 are not unique - the model was generated badly + self.assertEqual(len(self.model), 982) + + def test_iter(self): + aaa = False + for tok, freq in self.model: + if "aaaaaaa" in tok: + aaa = True + int(freq) + break + self.assertTrue(aaa) + + def test_prune(self): + pruned = self.model.prune(4) + for _, freq in pruned: + self.assertGreaterEqual(freq, 4) + self.assertEqual(len(pruned), 346) + + def test_prune_self(self): + pruned 
= self.model.prune(1) + self.assertIs(self.model, pruned) + + def test_greatest(self): + pruned = self.model.greatest(100) + freqs = [v for v in self.model._df.values()] + freqs.sort(reverse=True) + border = freqs[100] + for v in pruned._df.values(): + self.assertGreaterEqual(v, border) + df1 = pruned._df + df2 = self.model.greatest(100)._df + self.assertEqual(df1, df2) + + def test_greatest2(self): + df = DocumentFrequencies().construct(100, {str(x): x for x in range(1000)}) + df_greatest_true = {str(x): x for x in range(500, 1000)} + df_greatest = df.greatest(500) + self.assertEqual(df_greatest._df, df_greatest_true) + + df._df["500a"] = 500 + df._df["500b"] = 500 + df._df["500c"] = 500 + df._df["500d"] = 500 + df._df["500e"] = 500 + + df_greatest = df.greatest(500) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["500a"] = 500 + df_greatest = df.greatest(501) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["500b"] = 500 + df_greatest_true["500c"] = 500 + df_greatest_true["500d"] = 500 + df_greatest_true["500e"] = 500 + df_greatest = df.greatest(505) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["499"] = 499 + df_greatest = df.greatest(506) + self.assertEqual(df_greatest._df, df_greatest_true) + + def test_write(self): + buffer = BytesIO() + self.model.save(buffer) + buffer.seek(0) + new_model = DocumentFrequencies().load(buffer) + self.assertEqual(self.model._df, new_model._df) + self.assertEqual(self.model.docs, new_model.docs) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_dump.py b/sourced/ml/core/tests/test_dump.py new file mode 100644 index 0000000..74d2d19 --- /dev/null +++ b/sourced/ml/core/tests/test_dump.py @@ -0,0 +1,131 @@ +import argparse +from contextlib import contextmanager +from io import StringIO +import logging +import os +import shutil +import sys +import tempfile +import unittest + +from modelforge.tools import dump_model + +import sourced.ml.core.tests.models as paths + + +cache_dir = os.path.join(tempfile.gettempdir(), "ml-test-dump") + + +@contextmanager +def captured_output(): + log = StringIO() + log_handler = logging.StreamHandler(log) + logging.getLogger().addHandler(log_handler) + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr, log + finally: + sys.stdout, sys.stderr = old_out, old_err + logging.getLogger().removeHandler(log_handler) + + +class DumpTests(unittest.TestCase): + ID2VEC_DUMP = """{'created_at': '2017-06-18 17:37:06', \ +'dependencies': [], \ +'license': 'ODbL-1.0', \ +'model': 'id2vec', \ +'series': 'id2vec-1000', \ +'size': '1.1 MB', \ +'uuid': '92609e70-f79c-46b5-8419-55726e873cfc', \ +'vendor': 'source{d}', \ +'version': [1, 0, 0]} +Shape: (1000, 300) +First 10 words: ['get', 'name', 'type', 'string', 'class', 'set', 'data', 'value', 'self', 'test'] +""" + + DOCFREQ_DUMP = """{'created_at': '2017-08-09 16:49:12', \ +'dependencies': [], \ +'license': 'ODbL-1.0', \ +'model': 'docfreq', \ +'series': 'docfreq-1000', \ +'size': '6.4 kB', \ +'uuid': 'f64bacd4-67fb-4c64-8382-399a8e7db52a', \ +'vendor': 'source{d}', \ +'version': [0, 1, 0]} +Number of words: 982 +""" + "Random 10 words: " + + BOW_DUMP = """{'created_at': '2018-01-18 21:59:59', \ +'dependencies': [{'created_at': datetime.datetime(2018, 1, 18, 21, 59, 48, 828287), \ +'dependencies': [], \ +'model': 'docfreq', \ +'uuid': 
'2c4fcae7-93a6-496e-9e3a-d6e15d35b812', \ +'version': [1, 0, 0]}], \ +'license': 'ODbL-1.0', \ +'model': 'bow', \ +'parent': '51b4165d-b2c6-442a-93be-0eb35f4cc19a', \ +'series': 'bow-docfreq', \ +'size': '2.5 kB', \ +'uuid': '0d95f342-2c69-459f-9ee7-a1fc7da88d64', \ +'vendor': 'source{d}', \ +'version': [1, 0, 15]} +Shape: (5, 20) +First 10 documents: ['repo1', 'repo2', 'repo3', 'repo4', 'repo5'] +First 10 tokens: ['i.', 'i.*', 'i.Activity', 'i.AdapterView', 'i.ArrayAdapter', 'i.Arrays', 'i.Bundle', 'i.EditText', 'i.Exception', 'i.False']\n""" # noqa + + COOCC_DUMP = """{'created_at': '2018-01-24 16:00:02', \ +'dependencies': [{'created_at': datetime.datetime(2018, 1, 24, 15, 59, 24, 129470), \ +'dependencies': [], \ +'model': 'docfreq', \ +'uuid': '0f94a6c6-7dc3-4b3c-b8d2-917164a50581', \ +'version': [1, 0, 0]}], \ +'license': 'ODbL-1.0', \ +'model': 'co-occurrences', \ +'series': 'coocc-docfreq', \ +'size': '79.9 kB', \ +'uuid': 'e75dcb2d-ec1d-476b-a04b-bc64c7779ae1', \ +'vendor': 'source{d}', \ +'version': [1, 0, 0]} +Number of words: 304 +First 10 words: ['i.set', 'i.iter', 'i.error', 'i.logsdir', 'i.read', 'i.captur', 'i.clear',""" + \ + """ 'i.android', 'i.tohome', 'i.ljust'] +Matrix: shape: (304, 304) non-zero: 16001 +""" + + def tearDown(self): + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir) + + def test_id2vec(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.ID2VEC)) + self.assertEqual(out.getvalue(), self.ID2VEC_DUMP) + + def test_docfreq(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.DOCFREQ)) + self.assertEqual(out.getvalue()[:len(self.DOCFREQ_DUMP)], self.DOCFREQ_DUMP) + ending = "\nNumber of documents: 1000\n" + self.assertEqual(out.getvalue()[-len(ending):], ending) + + def test_bow(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.BOW)) + self.assertEqual(out.getvalue(), self.BOW_DUMP) + + def test_coocc(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.COOCC)) + self.assertEqual(out.getvalue(), self.COOCC_DUMP) + + @staticmethod + def _get_args(input): + return argparse.Namespace(input=input, backend=None, args=None, username="", + password="", index_repo="https://github.com/src-d/models", + cache=cache_dir, signoff=False, log_level="WARNING") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_id_splitter_features.py b/sourced/ml/core/tests/test_id_splitter_features.py new file mode 100644 index 0000000..6334d24 --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_features.py @@ -0,0 +1,145 @@ +import io +import tarfile +import tempfile +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow +from sourced.ml.core.tests.models import IDENTIFIERS + + +def write_fake_identifiers(tar_file, n_lines, char_sizes, n_cols, text="a"): + """ + Prepare file with fake identifiers. + :param tar_file: ready to write file. + :param n_lines: number of lines to generate. + :param char_sizes: sizes of identifiers. + :param n_cols: number of columns. + :param text: text that is used to fill identifiers. 
+ """ + # sanity check + if isinstance(char_sizes, int): + char_sizes = [char_sizes] * n_lines + assert len(char_sizes) == n_lines + + # generate file + res = [] + for sz in char_sizes: + line = ",".join([text * sz] * n_cols) + res.append(line) + content = "\n".join(res) + content = content.encode("utf-8") + + # add content to file + info = tarfile.TarInfo("identifiers.txt") + info.size = len(content) + tar_file.addfile(info, io.BytesIO(content)) + + +class IdSplitterTest(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_features(self): + from sourced.ml.core.algorithms.id_splitter.features import prepare_features + # check feature extraction + text = "a a" + n_lines = 10 + max_identifier_len = 20 + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=n_lines, char_sizes=1, n_cols=2, text=text) + feat = prepare_features(csv_path=tmp.name, use_header=True, identifier_col=0, + max_identifier_len=max_identifier_len, split_identifier_col=1, + shuffle=True, test_ratio=0.5, padding="post") + x_train, x_test, y_train, y_test = feat + # because of test_ratio=0.5 - shapes should be equal + self.assertEqual(x_test.shape, x_train.shape) + self.assertEqual(y_test.shape, y_train.shape) + # each line contains only one split -> so it should be only 5 nonzero for train/test + self.assertEqual(numpy.sum(y_test), 5) + self.assertEqual(numpy.sum(y_train), 5) + # each line contains only two chars -> so it should be only 10 nonzero for train/test + self.assertEqual(numpy.count_nonzero(x_test), 10) + self.assertEqual(numpy.count_nonzero(x_train), 10) + # y should be 3 dimensional matrix + self.assertEqual(y_test.ndim, 3) + self.assertEqual(y_train.ndim, 3) + # x should be 2 dimensional matrix + self.assertEqual(x_test.ndim, 2) + self.assertEqual(x_train.ndim, 2) + # check number of samples + self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines) + self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines) + # check max_identifier_len + self.assertEqual(x_test.shape[1], max_identifier_len) + self.assertEqual(x_train.shape[1], max_identifier_len) + self.assertEqual(y_test.shape[1], max_identifier_len) + self.assertEqual(y_train.shape[1], max_identifier_len) + + # normal file + try: + prepare_features(csv_path=IDENTIFIERS, use_header=True, identifier_col=0, + max_identifier_len=max_identifier_len, split_identifier_col=1, + shuffle=True, test_ratio=0.5, padding="post") + except Exception as e: + self.fail("prepare_features raised %s with log %s" % (type(e), str(e))) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_read_identifiers(self): + from sourced.ml.core.algorithms.id_splitter.features import read_identifiers + # read with header + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10, + identifier_col=3, split_identifier_col=4) + self.assertEqual(len(res), 10) + + # read without header + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers(csv_path=tmp.name, use_header=False, max_identifier_len=10, + identifier_col=3, 
split_identifier_col=4) + self.assertEqual(len(res), 9) + + # read with max_identifier_len equal to 0 -> expect empty list + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=0, + identifier_col=3, split_identifier_col=4) + self.assertEqual(len(res), 0) + + # generate temporary file with identifiers of specific lengths and filter by length + char_sizes = list(range(1, 11)) + + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=5) + + # check filtering + # read last two columns as identifiers + for i in range(11): + res = read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=i, + identifier_col=3, split_identifier_col=4) + self.assertEqual(len(res), i) + + # read wrong columns + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=2) + + with self.assertRaises(IndexError): + read_identifiers(csv_path=tmp.name, use_header=True, max_identifier_len=10, + identifier_col=3, split_identifier_col=4) + + # normal file + try: + read_identifiers(csv_path=IDENTIFIERS, use_header=True, max_identifier_len=10, + identifier_col=3, split_identifier_col=4) + except Exception as e: + self.fail("read_identifiers raised %s with log %s" % (type(e), str(e))) diff --git a/sourced/ml/core/tests/test_id_splitter_nn_model.py b/sourced/ml/core/tests/test_id_splitter_nn_model.py new file mode 100644 index 0000000..d98cfba --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_nn_model.py @@ -0,0 +1,56 @@ +import string +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow + + +class MetricsTests(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_register_metric(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS + fake_metric = "fake metric" + register_metric(fake_metric) + self.assertIn(fake_metric, METRICS) + METRICS.pop() + self.assertNotIn(fake_metric, METRICS) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_raise_register_metric(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS + bad_metric = 1 + with self.assertRaises(AssertionError): + register_metric(bad_metric) + self.assertNotIn(bad_metric, METRICS) + + +class ModelsTests(unittest.TestCase): + def setUp(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import build_rnn, build_cnn + self.n_uniq = len(string.ascii_lowercase) + self.model_rnn = build_rnn(maxlen=5, units=24, stack=2, rnn_layer="LSTM", + optimizer="Adam", dev0="/cpu:0", dev1="/cpu:0") + self.model_cnn = build_cnn(maxlen=5, filters=[64, 32, 16, 8], output_n_filters=32, + stack=2, kernel_sizes=[2, 4, 8, 16], optimizer="Adam", + device="/cpu:0") + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_rnn(self): + self.assertTrue(self.model_rnn.built) + self.assertTrue(self.model_rnn.trainable) + self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray) + self.assertEqual(self.model_rnn.get_weights()[0].shape, 
(self.n_uniq+1, self.n_uniq+1)) + self.assertTrue(self.model_rnn.uses_learning_phase) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_cnn(self): + self.assertTrue(self.model_cnn.built) + self.assertTrue(self.model_cnn.trainable) + self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray) + self.assertEqual(self.model_cnn.get_weights()[0].shape, (self.n_uniq+1, self.n_uniq+1)) + self.assertTrue(self.model_cnn.uses_learning_phase) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_id_splitter_pipeline.py b/sourced/ml/core/tests/test_id_splitter_pipeline.py new file mode 100644 index 0000000..ac8d384 --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_pipeline.py @@ -0,0 +1,126 @@ +import tempfile +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow + + +class IdSplitterPipelineTest(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_binarize(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import binarize + thresholds = [0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99] + n_pos = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + + for th, n_p in zip(thresholds, n_pos): + vals = numpy.arange(10) / 10 + res = binarize(vals, th) + self.assertEqual(sum(binarize(vals, th)), n_p) + if th in (0, 0.99): + self.assertEqual(numpy.unique(res).shape[0], 1) + else: + self.assertEqual(numpy.unique(res).shape[0], 2) + + vals = numpy.arange(10) / 10 + old_vals = vals.copy() + for th, n_p in zip(thresholds, n_pos): + res = binarize(vals, th, inplace=False) + self.assertEqual(sum(res), n_p) + self.assertTrue(numpy.array_equal(old_vals, vals)) + if th in (0, 0.99): + self.assertEqual(numpy.unique(res).shape[0], 1) + else: + self.assertEqual(numpy.unique(res).shape[0], 2) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_devices(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import prepare_devices + correct_args = ["1", "0,1", "-1"] + resulted_dev = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"), ("/cpu:0", "/cpu:0")] + for res, arg in zip(resulted_dev, correct_args): + self.assertEqual(res, prepare_devices(arg)) + + bad_args = ["", "1,2,3"] + for arg in bad_args: + with self.assertRaises(ValueError): + prepare_devices(arg) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_schedule(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import build_schedule + start_lr = 10 + end_lr = 1 + n_epochs = 9 + + lr_schedule = build_schedule(lr=start_lr, final_lr=end_lr, n_epochs=n_epochs) + + for i in range(n_epochs): + self.assertEqual(start_lr - i, lr_schedule(epoch=i)) + + with self.assertRaises(AssertionError): + lr_schedule(-1) + with self.assertRaises(AssertionError): + lr_schedule(n_epochs + 1) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_train_generator(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import build_train_generator + batch_size = 3 + # mismatch number of samples + bad_x = numpy.zeros(3) + bad_y = numpy.zeros(4) + with self.assertRaises(AssertionError): + build_train_generator(bad_x, bad_y, batch_size=batch_size) + + # check generator with correct inputs + x = numpy.zeros(5) + gen = build_train_generator(x, x, batch_size=batch_size) + expected_n_samples = [3, 2] + for n_samples in expected_n_samples: + x_gen, y_gen = next(gen) + 
self.assertEqual(x_gen.shape, y_gen.shape) + self.assertEqual(n_samples, x_gen.shape[0]) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_train_parameters(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import create_generator_params + batch_size = 500 + samples_per_epoch = 10 ** 6 + n_samples = 40 * 10 ** 6 + epochs = 10 + + steps_per_epoch_ = samples_per_epoch // batch_size + n_epochs_ = numpy.ceil(epochs * n_samples / samples_per_epoch) + + steps_per_epoch, n_epochs = create_generator_params(batch_size, samples_per_epoch, + n_samples, epochs) + self.assertEqual(steps_per_epoch, steps_per_epoch_) + self.assertEqual(n_epochs, n_epochs_) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_config_keras(self): + from keras.backend.tensorflow_backend import get_session + from sourced.ml.core.algorithms.id_splitter.pipeline import config_keras + config_keras() + sess = get_session() + self.assertTrue(sess._config.gpu_options.allow_growth) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_callbacks(self): + from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint + from sourced.ml.core.algorithms.id_splitter.pipeline import prepare_callbacks + with tempfile.TemporaryDirectory() as tmpdir: + callbacks = prepare_callbacks(tmpdir) + + # TensorBoard + self.assertIsInstance(callbacks[0], TensorBoard) + self.assertTrue(callbacks[0].log_dir.startswith(tmpdir)) + + # CSVLogger + self.assertIsInstance(callbacks[1], CSVLogger) + self.assertTrue(callbacks[1].filename.startswith(tmpdir)) + + # ModelCheckpoint + self.assertIsInstance(callbacks[2], ModelCheckpoint) + self.assertTrue(callbacks[2].filepath.startswith(tmpdir)) diff --git a/sourced/ml/core/tests/test_inttypes_to_nodes.py b/sourced/ml/core/tests/test_inttypes_to_nodes.py new file mode 100644 index 0000000..998a7fd --- /dev/null +++ b/sourced/ml/core/tests/test_inttypes_to_nodes.py @@ -0,0 +1,40 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2QuantizedChildren +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2NodesBagTest(unittest.TestCase): + def setUp(self): + self.nodes_bag_extractor = Uast2QuantizedChildren(npartitions=3) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.nodes_bag_extractor(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + def test_quantize_1(self): + freqs = {1: 100, 2: 90, 3: 10, 5: 10, 6: 5, 7: 5} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 3, 7]) + + def test_quantize_2(self): + freqs = {1: 10, 2: 10, 3: 10, 5: 10, 6: 10, 7: 10} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 3, 6, 7]) + + def test_quantize_3(self): + freqs = {1: 100, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 7, 7]) + + def test_quantize_4(self): + freqs = {1: 10, 2: 15, 3: 5, 5: 15, 6: 10, 7: 10} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 5, 7]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_merge_bow.py b/sourced/ml/core/tests/test_merge_bow.py new file mode 100644 index 0000000..d59d73c --- /dev/null +++ 
b/sourced/ml/core/tests/test_merge_bow.py @@ -0,0 +1,87 @@ +import os +import tempfile +import unittest + +import numpy +from scipy.sparse import csc_matrix + +from sourced.ml.core.models import BOW +from sourced.ml.core.models.model_converters.merge_bow import MergeBOW + + +class MergeBOWTests(unittest.TestCase): + def setUp(self): + self.model1 = BOW() \ + .construct(["doc_1", "doc_2", "doc_3"], ["f.tok_1", "k.tok_2", "f.tok_3"], + csc_matrix((numpy.array([1, 2]), + (numpy.array([0, 1]), numpy.array([1, 0]))), + shape=(3, 3))) + self.model1._meta = {"dependencies": [{"model": "docfreq", "uuid": "uuid"}]} + self.model2 = BOW() \ + .construct(["doc_4", "doc_5", "doc_6"], ["f.tok_1", "k.tok_2", "f.tok_3"], + csc_matrix((numpy.array([3, 4]), + (numpy.array([0, 1]), numpy.array([1, 0]))), + shape=(3, 3))) + self.model2._meta = {"dependencies": [{"model": "docfreq", "uuid": "uuid"}]} + self.merge_results = [[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 3, 0], [4, 0, 0], [0, 0, 0]] + self.merge_bow = MergeBOW() + + def test_convert_model_base(self): + self.merge_bow.convert_model(self.model1) + self.assertListEqual(self.merge_bow.documents, ["doc_1", "doc_2", "doc_3"]) + self.assertListEqual(self.merge_bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, row in enumerate(self.merge_bow.matrix[0].toarray()): + self.assertListEqual(list(row), self.merge_results[i]) + self.assertEqual(self.merge_bow.deps, [{"uuid": "uuid", "model": "docfreq"}]) + self.merge_bow.convert_model(self.model2) + self.assertListEqual(self.merge_bow.documents, + ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5", "doc_6"]) + self.assertListEqual(self.merge_bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, arr in enumerate(self.merge_bow.matrix): + for j, row in enumerate(arr.toarray()): + self.assertListEqual(list(row), self.merge_results[i * 3 + j]) + self.assertEqual(self.merge_bow.deps, [{"model": "docfreq", "uuid": "uuid"}]) + + def test_convert_model_error(self): + self.merge_bow.convert_model(self.model1) + self.model2._tokens = ["f.tok_1", "k.tok_2"] + with self.assertRaises(ValueError): + self.merge_bow.convert_model(self.model2) + self.model2._tokens = ["f.tok_1", "k.tok_2", "f.tok_3", "f.tok_4"] + with self.assertRaises(ValueError): + self.merge_bow.convert_model(self.model2) + + def test_finalize_base(self): + self.merge_bow.convert_model(self.model1) + self.merge_bow.convert_model(self.model2) + with tempfile.TemporaryDirectory(prefix="merge-bow-") as tmpdir: + dest = os.path.join(tmpdir, "bow.asdf") + self.merge_bow.finalize(0, dest) + bow = BOW().load(dest) + self.assertListEqual(bow.documents, + ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5", "doc_6"]) + self.assertListEqual(bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, row in enumerate(bow.matrix.toarray()): + self.assertListEqual(list(row), self.merge_results[i]) + self.assertEqual(bow.meta["dependencies"], [{"uuid": "uuid", "model": "docfreq"}]) + + def test_finalize_reduce(self): + self.merge_bow.convert_model(self.model1) + self.merge_bow.features_namespaces = "f." 
+ with tempfile.TemporaryDirectory(prefix="merge-bow-") as tmpdir: + dest = os.path.join(tmpdir, "bow.asdf") + self.merge_bow.finalize(0, dest) + bow = BOW().load(dest) + self.assertListEqual(bow.documents, ["doc_1", "doc_2", "doc_3"]) + self.assertListEqual(bow.tokens, ["f.tok_1", "f.tok_3"]) + for i, row in enumerate(bow.matrix.toarray()): + self.assertListEqual(list(row), self.merge_results[i][::2]) + self.assertEqual(bow.meta["dependencies"], [{"uuid": "uuid", "model": "docfreq"}]) + + def test_save_path(self): + self.assertEqual(self.merge_bow._save_path(0, "bow.asdf"), "bow.asdf") + self.assertEqual(self.merge_bow._save_path(0, "bow"), os.path.join("bow", "bow_0.asdf")) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_merge_df.py b/sourced/ml/core/tests/test_merge_df.py new file mode 100644 index 0000000..c2c536a --- /dev/null +++ b/sourced/ml/core/tests/test_merge_df.py @@ -0,0 +1,40 @@ +import os +import tempfile +import unittest + +from sourced.ml.core.models import DocumentFrequencies +from sourced.ml.core.models.model_converters.merge_df import MergeDocFreq + + +class Model2BaseTests(unittest.TestCase): + def setUp(self): + self.model1 = DocumentFrequencies().construct(3, {"one": 1, "two": 2, "three": 3}) + self.model2 = DocumentFrequencies().construct(3, {"four": 4, "three": 3, "five": 5}) + self.merge_df = MergeDocFreq(min_docfreq=1, vocabulary_size=100) + self.merge_result = {"one": 1, "two": 2, "three": 6, "four": 4, "five": 5} + + def test_convert_model(self): + self.merge_df.convert_model(self.model1) + self.assertEqual(self.merge_df._docs, 3) + self.assertEqual(self.merge_df._df, self.model1._df) + self.merge_df.convert_model(self.model2) + self.assertEqual(self.merge_df._docs, 6) + self.assertEqual(self.merge_df._df, self.merge_result) + + def test_finalize(self): + self.merge_df.convert_model(self.model1) + self.merge_df.convert_model(self.model2) + with tempfile.TemporaryDirectory(prefix="merge-df-") as tmpdir: + dest = os.path.join(tmpdir, "df.asdf") + self.merge_df.finalize(0, dest) + df = DocumentFrequencies().load(dest) + self.assertEqual(df.docs, 6) + self.assertEqual(df._df, self.merge_result) + + def test_save_path(self): + self.assertEqual(self.merge_df._save_path(0, "df.asdf"), "df.asdf") + self.assertEqual(self.merge_df._save_path(0, "df"), os.path.join("df", "docfreq_0.asdf")) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_model2base.py b/sourced/ml/core/tests/test_model2base.py new file mode 100644 index 0000000..97c6625 --- /dev/null +++ b/sourced/ml/core/tests/test_model2base.py @@ -0,0 +1,99 @@ +import os +import tempfile +import unittest + +from sourced.ml.core.models.model_converters.base import Model2Base + + +class FromModel: + NAME = "from" + meta = {"dependencies": ()} + + def __init__(self, **kwargs): + pass + + def load(self, source): + pass + + +class ToModel: + NAME = "to" + output = None + meta = {"dependencies": ()} + + def __init__(self, **kwargs): + pass + + def save(self, output, deps=None): + ToModel.output = output + + +class Model2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + finalized = False + + def convert_model(self, model): + return ToModel() + + +class MockingModel2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + finalized = False + + def convert_model(self, model): + return ToModel() + + def finalize(self, index: int, destdir: str): + self.finalized = True + + +class 
RaisingModel2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + + def convert_model(self, model): + raise ValueError("happens") + + +class FakeQueue: + def __init__(self, contents: list): + self.contents = contents + + def get(self): + return self.contents.pop() + + def put(self, item): + self.contents.append(item) + + +class Model2BaseTests(unittest.TestCase): + def test_convert(self): + converter = Model2Test(num_processes=2) + with tempfile.TemporaryDirectory() as tmpdir: + status = converter.convert(os.listdir(os.path.dirname(__file__)), tmpdir) + self.assertGreater(status, 20) + + def test_process_entry(self): + converter = MockingModel2Test(num_processes=2) + queue_in = FakeQueue([None, "srcdir/job"]) + queue_out = FakeQueue([]) + with tempfile.TemporaryDirectory(prefix="sourced-ml-") as tmpdir: + converter._process_entry( + 0, os.path.join(tmpdir, "destdir"), queue_in, queue_out) + self.assertTrue(os.path.exists(os.path.join(tmpdir, "destdir"))) + self.assertEqual(ToModel.output, os.path.join(tmpdir, "destdir", "job")) + self.assertTrue(converter.finalized) + self.assertEqual(queue_out.contents, [("srcdir/job", True)]) + + def test_process_entry_exception(self): + converter = RaisingModel2Test(num_processes=2) + queue_in = FakeQueue([None, "srcdir/job"]) + queue_out = FakeQueue([]) + converter._process_entry(0, "destdir", queue_in, queue_out) + self.assertEqual(queue_out.contents, [("srcdir/job", False)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_pickleable_logger.py b/sourced/ml/core/tests/test_pickleable_logger.py new file mode 100644 index 0000000..e52b436 --- /dev/null +++ b/sourced/ml/core/tests/test_pickleable_logger.py @@ -0,0 +1,22 @@ +import logging +import pickle +import unittest + +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class TestLogger(PickleableLogger): + def _get_log_name(self): + return "test" + + +class PickleableLoggerTests(unittest.TestCase): + def test_pickle(self): + logger = TestLogger(log_level=logging.ERROR) + logger = pickle._loads(pickle._dumps(logger)) + self.assertIsInstance(logger._log, logging.Logger) + self.assertEqual(logger._log.level, logging.ERROR) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_projector.py b/sourced/ml/core/tests/test_projector.py new file mode 100644 index 0000000..0c36b60 --- /dev/null +++ b/sourced/ml/core/tests/test_projector.py @@ -0,0 +1,127 @@ +import json +import os +import shutil +import socket +import tempfile +import time +import unittest + +from modelforge import slogging +import requests + +from sourced.ml.core.tests.test_dump import captured_output +from sourced.ml.core.utils.projector import CORSWebServer, present_embeddings, wait, web_server + + +class ProjectorTests(unittest.TestCase): + MAX_ATTEMPTS = 40 + + @classmethod + def setUpClass(cls): + slogging.setup("DEBUG", False) + + def setUp(self): + self.pwd = os.getcwd() + + def tearDown(self): + os.chdir(self.pwd) + + def wait_for_web_server(self): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + result = -1 + attempts = 0 + while result != 0 and attempts < self.MAX_ATTEMPTS: + time.sleep(0.05) + attempts += 1 + result = sock.connect_ex(("0.0.0.0", 8000)) + return attempts, result + + def test_web_server(self): + with tempfile.TemporaryDirectory(prefix="sourced.ml.core-test-") as tmpdir: + os.chdir(tmpdir) + testfile = "test.txt" + with open(testfile, "w") as fout: + fout.write("The Zen of 
Python, by Tim Peters") + server = CORSWebServer() + server.start() + + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertEqual(requests.get("http://0.0.0.0:8000/test.txt").text, + "The Zen of Python, by Tim Peters") + finally: + server.stop() + + def test_wait(self): + web_server.start() + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertTrue(web_server.running) + except: # noqa + web_server.stop() + raise + os.environ["PROJECTOR_SERVER_TIME"] = "0" + wait() + self.assertFalse(web_server.running) + web_server.start() + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertTrue(web_server.running) + finally: + web_server.stop() + + def test_present_embeddings(self): + with tempfile.TemporaryDirectory(prefix="sourced.ml.core-test-") as tmpdir: + tmpdir = os.path.join(tmpdir, "1", "2") + present_embeddings(tmpdir, False, ["one", "two"], + [(str(i), "x") for i in range(5)], + [(i, i) for i in range(5)]) + with open(os.path.join(tmpdir, "id2vec.json")) as fin: + json.load(fin) + with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin: + self.assertEqual(fin.read(), "one\ttwo\n0\tx\n1\tx\n2\tx\n3\tx\n4\tx\n") + with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin: + self.assertEqual(fin.read(), "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n") + + def test_present_embeddings_run_server(self): + def sweded_which(prog): + return None + + which = shutil.which + shutil.which = sweded_which + browser = os.getenv("BROWSER", "") + os.environ["BROWSER"] = "" + + try: + with tempfile.TemporaryDirectory(prefix="sourced.ml.core-test-") as tmpdir: + with captured_output() as (stdout, _, _): + present_embeddings(tmpdir, True, ["one"], + [str(i) for i in range(5)], + [(i, i) for i in range(5)]) + with open(os.path.join(tmpdir, "id2vec.json")) as fin: + json.load(fin) + with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin: + self.assertEqual(fin.read(), "0\n1\n2\n3\n4\n") + with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin: + self.assertEqual(fin.read(), "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n") + self.assertIn( + "\thttp://projector.tensorflow.org/?config=http://0.0.0.0:8000/id2vec.json\n", + stdout.getvalue()) + finally: + shutil.which = which + os.environ["BROWSER"] = browser + web_server.stop() + + def test_stop(self): + web_server.stop() # dummy test to avoid partially covered line in CI + self.assertFalse(web_server.running) + web_server.start() + web_server.stop() + self.assertFalse(web_server.running) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_quant.py b/sourced/ml/core/tests/test_quant.py new file mode 100644 index 0000000..f2decf9 --- /dev/null +++ b/sourced/ml/core/tests/test_quant.py @@ -0,0 +1,49 @@ +from io import BytesIO +import unittest + +import numpy + +from sourced.ml.core.models import QuantizationLevels +import sourced.ml.core.tests.models as paths + + +class QuantizationLevelsTests(unittest.TestCase): + def setUp(self): + self.model = QuantizationLevels().load(source=paths.QUANTLEVELS) + + def test_levels(self): + levels = self.model.levels + self.assertIsInstance(levels, dict) + self.assertEqual(len(levels), 1) + self.assertIsInstance(levels["children"], dict) + self.assertEqual(len(levels["children"]), 259) + + def test_len(self): + self.assertEqual(len(self.model), 1) + + def test_write(self): + levels = {"xxx": 
{"a": numpy.array([1, 2, 3]), "b": numpy.array([4, 5, 6]), + "c": numpy.array([7, 8, 9])}, + "yyy": {"q": numpy.array([3, 2, 1]), "w": numpy.array([6, 5, 4]), + "e": numpy.array([9, 8, 7])}} + buffer = BytesIO() + QuantizationLevels().construct(levels).save(output=buffer, series="quant") + buffer.seek(0) + model = QuantizationLevels().load(buffer) + levels = model.levels + self.assertEqual(len(levels), 2) + self.assertEqual(len(levels["xxx"]), 3) + self.assertEqual(len(levels["yyy"]), 3) + self.assertTrue((levels["xxx"]["a"] == numpy.array([1, 2, 3])).all()) + self.assertTrue((levels["xxx"]["b"] == numpy.array([4, 5, 6])).all()) + self.assertTrue((levels["xxx"]["c"] == numpy.array([7, 8, 9])).all()) + self.assertTrue((levels["yyy"]["q"] == numpy.array([3, 2, 1])).all()) + self.assertTrue((levels["yyy"]["w"] == numpy.array([6, 5, 4])).all()) + self.assertTrue((levels["yyy"]["e"] == numpy.array([9, 8, 7])).all()) + + def test_dump(self): + self.assertEqual(self.model.dump(), "Schemes: [('children', '259@10')]") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_random_walk.py b/sourced/ml/core/tests/test_random_walk.py new file mode 100644 index 0000000..1827589 --- /dev/null +++ b/sourced/ml/core/tests/test_random_walk.py @@ -0,0 +1,29 @@ +import unittest + +import bblfsh + +from sourced.ml.core.algorithms.uast_ids_to_bag import FakeVocabulary +from sourced.ml.core.algorithms.uast_struct_to_bag import Uast2RandomWalks +from sourced.ml.core.tests import models + + +class RandomWalkTests(unittest.TestCase): + def setUp(self): + self.bblfsh = bblfsh.BblfshClient("localhost:9432") + self.uast = self.bblfsh.parse(models.SOURCE_PY).uast + self.uast2walk = Uast2RandomWalks(p_explore_neighborhood=0.5, + q_leave_neighborhood=0.5, + n_walks=5, + n_steps=19, + node2index=FakeVocabulary(), + seed=42) + + def test_rw(self): + for walk in self.uast2walk(self.uast): + for i in range(len(walk)-1): + self.assertNotEqual(walk[i], walk[i+1], + "Two neighbours nodes should not be the same") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_token_parser.py b/sourced/ml/core/tests/test_token_parser.py new file mode 100644 index 0000000..5dab621 --- /dev/null +++ b/sourced/ml/core/tests/test_token_parser.py @@ -0,0 +1,173 @@ +import pickle +import unittest + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser, TokenParser + + +class TokenParserTests(unittest.TestCase): + def setUp(self): + self.tp = TokenParser(stem_threshold=4, max_token_length=20) + self.tp._single_shot = False + + def test_process_token(self): + self.tp.max_token_length = 100 + + tokens = [ + ("UpperCamelCase", ["upper", "camel", "case"]), + ("camelCase", ["camel", "case"]), + ("FRAPScase", ["frap", "case"]), + ("SQLThing", ["sqlt", "hing"]), + ("_Astra", ["astra"]), + ("CAPS_CONST", ["caps", "const"]), + ("_something_SILLY_", ["someth", "silli"]), + ("blink182", ["blink"]), + ("FooBar100500Bingo", ["foo", "bar", "bingo"]), + ("Man45var", ["man", "var"]), + ("method_name", ["method", "name"]), + ("Method_Name", ["method", "name"]), + ("101dalms", ["dalm"]), + ("101_dalms", ["dalm"]), + ("101_DalmsBug", ["dalm", "bug"]), + ("101_Dalms45Bug7", ["dalm", "bug"]), + ("wdSize", ["wd", "size", "wdsize"]), + ("Glint", ["glint"]), + ("foo_BAR", ["foo", "bar"]), + ("sourced.ml.algorithms.uast_ids_to_bag", + ["sourc", "sourcedml", "algorithm", "mlalgorithm", + "uast", "ids", "idsto", "bag", "tobag"]), + ("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", 
"can", "imagin"]), + # Another bad example. Parser failed to parse it correctly + ("SmallIdsToFoOo", ["small", "ids", "idsto", "fo", "oo"]), + ("SmallIdFooo", ["small", "smallid", "fooo", "idfooo"]), + ("ONE_M0re_.__badId.example", ["one", "onem", "re", "bad", "rebad", + "badid", "exampl", "idexampl"]), + ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]), + ("a.b.c.d", ["a", "b", "c", "d"]), + ("A.b.Cd.E", ["a", "b", "cd", "e"]), + ("looong_sh_loooong_sh", ["looong", "looongsh", "loooong", "shloooong", "loooongsh"]), + ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]), + ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]) + ] + + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, correct) + + def test_process_token_single_shot(self): + self.tp.max_token_length = 100 + self.tp._single_shot = True + self.tp.min_split_length = 1 + tokens = [ + ("UpperCamelCase", ["upper", "camel", "case"]), + ("camelCase", ["camel", "case"]), + ("FRAPScase", ["frap", "case"]), + ("SQLThing", ["sqlt", "hing"]), + ("_Astra", ["astra"]), + ("CAPS_CONST", ["caps", "const"]), + ("_something_SILLY_", ["someth", "silli"]), + ("blink182", ["blink"]), + ("FooBar100500Bingo", ["foo", "bar", "bingo"]), + ("Man45var", ["man", "var"]), + ("method_name", ["method", "name"]), + ("Method_Name", ["method", "name"]), + ("101dalms", ["dalm"]), + ("101_dalms", ["dalm"]), + ("101_DalmsBug", ["dalm", "bug"]), + ("101_Dalms45Bug7", ["dalm", "bug"]), + ("wdSize", ["wd", "size"]), + ("Glint", ["glint"]), + ("foo_BAR", ["foo", "bar"]), + ("sourced.ml.algorithms.uast_ids_to_bag", + ["sourc", "ml", "algorithm", "uast", "ids", "to", "bag"]), + ("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", "can", "imagin"]), + # Another bad example. Parser failed to parse it correctly + ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]), + ("SmallIdFooo", ["small", "id", "fooo"]), + ("ONE_M0re_.__badId.example", ["one", "m", "re", "bad", "id", "exampl"]), + ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]), + ("a.b.c.d", ["a", "b", "c", "d"]), + ("A.b.Cd.E", ["a", "b", "cd", "e"]), + ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]), + ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]), + ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]) + ] + + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, correct) + + min_split_length = 3 + self.tp.min_split_length = min_split_length + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, [c for c in correct if len(c) >= min_split_length]) + + def test_split(self): + self.assertEqual(list(self.tp.split("set for")), ["set", "for"]) + self.assertEqual(list(self.tp.split("set /for.")), ["set", "for"]) + self.assertEqual(list(self.tp.split("NeverHav")), ["never", "hav"]) + self.assertEqual(list(self.tp.split("PrintAll")), ["print", "all"]) + self.assertEqual(list(self.tp.split("PrintAllExcept")), ["print", "all", "except"]) + self.assertEqual( + list(self.tp.split("print really long line")), + # 'longli' is expected artifact due to edge effects + ["print", "really", "long", "longli"]) + self.assertEqual( + list(self.tp.split("set /for. 
*&PrintAll")), + ["set", "for", "print", "all"]) + self.assertEqual( + list(self.tp.split("JumpDown not Here")), + ["jump", "down", "not", "here"]) + + self.assertEqual( + list(self.tp.split("a b c d")), + ["a", "b", "c", "d"]) + self.assertEqual( + list(self.tp.split("a b long c d")), + ["a", "b", "long", "blong", "longc", "d"]) + self.assertEqual( + list(self.tp.split("AbCd")), + ["ab", "cd"]) + + def test_split_single_shot(self): + self.tp._single_shot = True + self.tp.min_split_length = 1 + self.assertEqual( + list(self.tp.split("print really long line")), + # 'longli' is expected artifact due to edge effects + ["print", "really", "long", "li"]) + self.assertEqual( + list(self.tp.split("a b c d")), + ["a", "b", "c", "d"]) + self.assertEqual( + list(self.tp.split("a b long c d")), + ["a", "b", "long", "c", "d"]) + self.assertEqual( + list(self.tp.split("AbCd")), + ["ab", "cd"]) + + def test_stem(self): + self.assertEqual(self.tp.stem("lol"), "lol") + self.assertEqual(self.tp.stem("apple"), "appl") + self.assertEqual(self.tp.stem("orange"), "orang") + self.assertEqual(self.tp.stem("embedding"), "embed") + self.assertEqual(self.tp.stem("Alfred"), "Alfred") + self.assertEqual(self.tp.stem("Pluto"), "Pluto") + + def test_pickle(self): + tp = pickle.loads(pickle.dumps(self.tp)) + self.assertEqual(tp.stem("embedding"), "embed") + + +class NoopTokenParserTests(unittest.TestCase): + def setUp(self): + self.tp = NoopTokenParser() + + def test_process_token(self): + self.assertEqual(list(self.tp.process_token("abcdef")), ["abcdef"]) + self.assertEqual(list(self.tp.process_token("abcd_ef")), ["abcd_ef"]) + self.assertEqual(list(self.tp.process_token("abcDef")), ["abcDef"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py b/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py new file mode 100644 index 0000000..b360683 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py @@ -0,0 +1,20 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2GraphletBag +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2GraphletBagTest(unittest.TestCase): + def setUp(self): + self.graphlets_bag_extractor = Uast2GraphletBag() + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.graphlets_bag_extractor(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_struct_to_bag.py b/sourced/ml/core/tests/test_uast_struct_to_bag.py new file mode 100644 index 0000000..d6590d4 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_struct_to_bag.py @@ -0,0 +1,55 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import UastRandomWalk2Bag, UastSeq2Bag +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2RandomWalk2BagTest(unittest.TestCase): + def setUp(self): + self.uast_random_walk2bag = UastRandomWalk2Bag(seq_len=[2, 3]) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.uast_random_walk2bag(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + def test_equivalence_prepare_starting_nodes(self): + starting_nodes_old = self.prepare_starting_nodes(self.uast) + starting_nodes = self.uast_random_walk2bag.uast2walks.prepare_starting_nodes(self.uast) + 
self.assertEqual(len(starting_nodes_old), len(starting_nodes)) + + def structure(tree): + from collections import Counter + return set(Counter(len(node.children) for node in tree)) + + self.assertEqual(structure(starting_nodes_old), structure(starting_nodes)) + + def prepare_starting_nodes(self, uast): + starting_nodes = [] + self._prepare_starting_nodes(uast, None, starting_nodes) + + return starting_nodes + + def _prepare_starting_nodes(self, root, parent, starting_nodes): + node = self.uast_random_walk2bag.uast2walks._extract_node(node=root, parent=parent) + starting_nodes.append(node) + + for ch in root.children: + node.children.append(self._prepare_starting_nodes( + ch, parent=node, starting_nodes=starting_nodes)) + + +class UastSeq2BagTest(unittest.TestCase): + def setUp(self): + self.uast_seq2bag = UastSeq2Bag(seq_len=[2, 3]) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.uast_seq2bag(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_id_distance.py b/sourced/ml/core/tests/test_uast_to_id_distance.py new file mode 100644 index 0000000..d3f5d56 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_id_distance.py @@ -0,0 +1,138 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2IdLineDistance, Uast2IdTreeDistance +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2IdTreeDistanceTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2IdTreeDistance(token_parser=NoopTokenParser(), + max_distance=4) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + self.maxDiff = None + + def test_result(self): + correct = [(("__spec__", "ModuleSpec"), 2), + (("__spec__", "ModuleSpec"), 3), + (("__spec__", "ModuleSpec"), 3), + (("collections", "ModuleSpec"), 2), + (("collections", "ModuleSpec"), 2), + (("collections", "ModuleSpec"), 3), + (("collections", "__spec__"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "collections"), 3), + (("namedtuple", "collections"), 3), + (("setup", "modelforge.logs"), 3), + (("setup_logging", "modelforge.logs"), 3), + (("sys", "modelforge.logs"), 3), + (("sys", "modules"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 3), + (("utmain", "ModuleSpec"), 3), + (("utmain", "__package__"), 2), + (("utmain", "__spec__"), 2), + (("utmain", "__spec__"), 2), + (("utmain", "collections"), 3), + (("utmain", "modelforge.logs"), 2), + (("utmain", "modelforge.logs"), 2), + (("utmain", "setup"), 3), + (("utmain", "setup"), 3), + (("utmain", "setup_logging"), 3), + (("utmain", "setup_logging"), 3), + (("utmain", "sys"), 3), + (("utmain", "sys"), 3)] + + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +class Uast2IdLineDistanceTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2IdLineDistance(token_parser=NoopTokenParser(), + max_distance=3) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + self.maxDiff = None + + def test_result(self): + correct = [(("__package__", "ModuleSpec"), 2), + (("__spec__", "ModuleSpec"), 0), + (("__spec__", "ModuleSpec"), 1), + (("__spec__", "ModuleSpec"), 1), + (("__spec__", 
"ModuleSpec"), 2), + (("__spec__", "__package__"), 0), + (("collections", "ModuleSpec"), 1), + (("collections", "ModuleSpec"), 2), + (("collections", "__package__"), 1), + (("collections", "__spec__"), 1), + (("collections", "__spec__"), 2), + (("modules", "__package__"), 1), + (("modules", "__spec__"), 1), + (("modules", "collections"), 2), + (("namedtuple", "ModuleSpec"), 0), + (("namedtuple", "ModuleSpec"), 1), + (("namedtuple", "ModuleSpec"), 1), + (("namedtuple", "ModuleSpec"), 2), + (("namedtuple", "ModuleSpec"), 2), + (("namedtuple", "__package__"), 1), + (("namedtuple", "__package__"), 2), + (("namedtuple", "__spec__"), 1), + (("namedtuple", "__spec__"), 1), + (("namedtuple", "__spec__"), 2), + (("namedtuple", "__spec__"), 2), + (("namedtuple", "collections"), 0), + (("namedtuple", "collections"), 1), + (("namedtuple", "modules"), 2), + (("setup_logging", "modelforge.logs"), 0), + (("setup_logging", "setup"), 1), + (("sys", "__package__"), 1), + (("sys", "__spec__"), 1), + (("sys", "collections"), 2), + (("sys", "modelforge.logs"), 2), + (("sys", "modules"), 0), + (("sys", "namedtuple"), 2), + (("sys", "setup_logging"), 2), + (("utmain", "ModuleSpec"), 0), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "__package__"), 0), + (("utmain", "__package__"), 0), + (("utmain", "__package__"), 1), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 1), + (("utmain", "__spec__"), 2), + (("utmain", "collections"), 1), + (("utmain", "collections"), 1), + (("utmain", "collections"), 2), + (("utmain", "collections"), 2), + (("utmain", "modules"), 0), + (("utmain", "modules"), 1), + (("utmain", "modules"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 2), + (("utmain", "namedtuple"), 2), + (("utmain", "namedtuple"), 2), + (("utmain", "namedtuple"), 2), + (("utmain", "sys"), 0), + (("utmain", "sys"), 1), + (("utmain", "sys"), 1)] + + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_id_sequence.py b/sourced/ml/core/tests/test_uast_to_id_sequence.py new file mode 100644 index 0000000..ffce913 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_id_sequence.py @@ -0,0 +1,25 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2IdSequence +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2IdSequenceTest(unittest.TestCase): + def setUp(self): + self.uast2id_sequence = Uast2IdSequence(token_parser=NoopTokenParser()) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_result(self): + correct = ["sys", "setup_logging", "modelforge.logs", "utmain", "modules", "sys", + "__package__", "utmain", "__spec__", "utmain", "namedtuple", "collections", + "ModuleSpec", "namedtuple", "__spec__", "utmain", "ModuleSpec", "ModuleSpec", + "utmain", "setup", "setup_logging"] + res = self.uast2id_sequence(self.uast) + self.assertEqual(res, self.uast2id_sequence.concat(correct)) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_role_id_pairs.py 
b/sourced/ml/core/tests/test_uast_to_role_id_pairs.py new file mode 100644 index 0000000..6ec6b08 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_role_id_pairs.py @@ -0,0 +1,42 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2RoleIdPairs +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2NodesBagTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2RoleIdPairs(token_parser=NoopTokenParser()) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_result(self): + correct = [("ModuleSpec", "BODY | IF | THEN"), + ("ModuleSpec", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("ModuleSpec", "STATEMENT | INCOMPLETE"), + ("__package__", "BINARY | EXPRESSION | CONDITION"), + ("__spec__", "BINARY | EXPRESSION | CONDITION"), + ("__spec__", "BODY | IF | THEN"), + ("collections", "IDENTIFIER | IMPORT | PATHNAME"), + ("modelforge.logs", "IDENTIFIER | IMPORT | PATHNAME"), + ("modules", "RIGHT | EXPRESSION | INCOMPLETE"), + ("namedtuple", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("namedtuple", "IDENTIFIER | IMPORT | PATHNAME"), + ("setup", "IDENTIFIER | DECLARATION | FUNCTION | NAME"), + ("setup_logging", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("setup_logging", "IDENTIFIER | IMPORT | PATHNAME"), + ("sys", "IDENTIFIER | IMPORT | PATHNAME"), + ("sys", "RIGHT | EXPRESSION | INCOMPLETE"), + ("utmain", "BINARY | EXPRESSION | CONDITION"), + ("utmain", "BINARY | EXPRESSION | CONDITION"), + ("utmain", "BODY | IF | THEN"), + ("utmain", "FILE | MODULE"), + ("utmain", "STATEMENT | INCOMPLETE")] + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/utils/__init__.py b/sourced/ml/core/utils/__init__.py new file mode 100644 index 0000000..41754f0 --- /dev/null +++ b/sourced/ml/core/utils/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from sourced.ml.core.utils.bigartm import install_bigartm +from sourced.ml.core.utils.pickleable_logger import PickleableLogger diff --git a/sourced/ml/core/utils/bblfsh.py b/sourced/ml/core/utils/bblfsh.py new file mode 100644 index 0000000..12b111a --- /dev/null +++ b/sourced/ml/core/utils/bblfsh.py @@ -0,0 +1,19 @@ +from distutils.version import StrictVersion + +from bblfsh.client import BblfshClient + +BBLFSH_VERSION_LOW = "2.2" +BBLFSH_VERSION_HIGH = "3.0" + + +def check_version(host: str = "0.0.0.0", port: str = "9432") -> bool: + """ + Check if the bblfsh server version matches module requirements. 
+ + :param host: bblfsh server host + :param port: bblfsh server port + :return: True if bblfsh version specified matches requirements + """ + # get version and remove leading 'v' + version = StrictVersion(BblfshClient("%s:%s" % (host, port)).version().version.lstrip("v")) + return StrictVersion(BBLFSH_VERSION_LOW) <= version < StrictVersion(BBLFSH_VERSION_HIGH) diff --git a/sourced/ml/core/utils/bblfsh_roles.py b/sourced/ml/core/utils/bblfsh_roles.py new file mode 100644 index 0000000..22f8569 --- /dev/null +++ b/sourced/ml/core/utils/bblfsh_roles.py @@ -0,0 +1,14 @@ +import bblfsh + + +IDENTIFIER = bblfsh.role_id("IDENTIFIER") +QUALIFIED = bblfsh.role_id("QUALIFIED") +LITERAL = bblfsh.role_id("LITERAL") +OPERATOR = bblfsh.role_id("OPERATOR") +EXPRESSION = bblfsh.role_id("EXPRESSION") +LEFT = bblfsh.role_id("LEFT") +BINARY = bblfsh.role_id("BINARY") +ASSIGNMENT = bblfsh.role_id("ASSIGNMENT") +FUNCTION = bblfsh.role_id("FUNCTION") +DECLARATION = bblfsh.role_id("DECLARATION") +NAME = bblfsh.role_id("NAME") diff --git a/sourced/ml/core/utils/bigartm.py b/sourced/ml/core/utils/bigartm.py new file mode 100644 index 0000000..26e78e7 --- /dev/null +++ b/sourced/ml/core/utils/bigartm.py @@ -0,0 +1,58 @@ +import glob +import logging +import multiprocessing +import os +import shutil +import subprocess +import tempfile + + +def execute(cmd, cwd, log): + log.info(">>> %s", cmd) + parsed = [v for v in cmd.split(" ") if v] + subprocess.check_call(parsed, cwd=cwd) + + +def install_bigartm(args=None, target="./bigartm", tempdir=None): + """ + Deploys bigartm/bigartm at the specified path. + + :param args: :class:`argparse.Namespace` with "output" and "tmpdir". \ + "output" sets the target directory, "tmpdir" sets \ + the temporary directory which is used to clone bigartm/bigartm \ + and build it. + :param target: The path to the built executable. If args is not None, it \ + becomes overridden. + :param tempdir: The temporary directory where to clone and build \ + bigartm/bigartm. If args is not None, it becomes overridden. + :return: None if successful; otherwise, the error code (can be 0!). 
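+ + Example (both paths below are illustrative, not defaults):: + + install_bigartm(target="./bin/bigartm", tempdir="/tmp")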
+ """ + log = logging.getLogger("bigartm") + if args is not None: + tempdir = args.tmpdir + target = os.path.join(args.output, "bigartm") + if shutil.which(os.path.basename(target)) or shutil.which(target, path=os.getcwd()): + log.warning("bigartm is in the PATH, no-op.") + return 0 + if not shutil.which("cmake"): + log.error("You need to install cmake.") + return 1 + parent_dir = os.path.dirname(target) + os.makedirs(parent_dir, exist_ok=True) + if not os.path.isdir(parent_dir): + log.error("%s is not a directory.", parent_dir) + return 2 + with tempfile.TemporaryDirectory(prefix="bigartm-", dir=tempdir) as tmpdir: + log.info("Building bigartm/bigartm in %s...", tmpdir) + execute("git clone --single-branch --depth=1 https://github.com/bigartm/bigartm .", + tmpdir, log) + cwd = os.path.join(tmpdir, "build") + os.mkdir(cwd) + execute("cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DPYTHON=python3 ..", + cwd, log) + execute("make -j%d" % multiprocessing.cpu_count(), cwd, log) + whl_path = glob.glob(os.path.join(tmpdir, "build/python/*.whl"))[0] + execute("pip3 install \"%s\"" % whl_path, cwd, log) + shutil.copyfile(os.path.join(cwd, "bin", "bigartm"), target) + os.chmod(target, 0o777) + log.info("Installed %s", os.path.abspath(target)) diff --git a/sourced/ml/core/utils/pickleable_logger.py b/sourced/ml/core/utils/pickleable_logger.py new file mode 100644 index 0000000..31d5bd0 --- /dev/null +++ b/sourced/ml/core/utils/pickleable_logger.py @@ -0,0 +1,35 @@ +import logging + + +class PickleableLogger: + """ + Base class which provides the logging features through ``self._log``. + + Can be safely pickled. + """ + + def __init__(self, log_level=logging.INFO): + """ + Class constructor + + :param log_level: logging level. + """ + self._log = logging.getLogger(self._get_log_name()) + self._log.setLevel(log_level) + + def __getstate__(self): + state = self.__dict__.copy() + state["_log"] = self._log.level + return state + + def __setstate__(self, state): + self.__dict__.update(state) + log_level = state["_log"] + self._log = logging.getLogger(self._get_log_name()) + self._log.setLevel(log_level) + + def _get_log_name(self): + """ + Children must implement this method. It shall return the logger's name. 
+ """ + raise NotImplementedError diff --git a/sourced/ml/core/utils/projector.py b/sourced/ml/core/utils/projector.py new file mode 100644 index 0000000..77d747b --- /dev/null +++ b/sourced/ml/core/utils/projector.py @@ -0,0 +1,108 @@ +from http.server import HTTPServer, SimpleHTTPRequestHandler, test +import logging +import os +import shutil +import threading +import time + + +class CORSWebServer: + def __init__(self): + self.thread = None + self.server = None + + def serve(self): + outer = self + + class ClojureServer(HTTPServer): + def __init__(self, *args, **kwargs): + HTTPServer.__init__(self, *args, **kwargs) + outer.server = self + + class CORSRequestHandler(SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + SimpleHTTPRequestHandler.end_headers(self) + + test(CORSRequestHandler, ClojureServer) + + def start(self): + self.thread = threading.Thread(target=self.serve) + self.thread.start() + + def stop(self): + if self.running: + self.server.shutdown() + self.server.server_close() + self.thread.join() + self.server = None + self.thread = None + + @property + def running(self): + return self.server is not None + + +web_server = CORSWebServer() + + +def present_embeddings(destdir, run_server, labels, index, embeddings): + log = logging.getLogger("projector") + log.info("Writing Tensorflow Projector files...") + if not os.path.isdir(destdir): + os.makedirs(destdir) + os.chdir(destdir) + metaf = "id2vec_meta.tsv" + with open(metaf, "w") as fout: + if len(labels) > 1: + fout.write("\t".join(labels) + "\n") + for item in index: + if len(labels) > 1: + fout.write("\t".join(item) + "\n") + else: + fout.write(item + "\n") + log.info("Wrote %s", metaf) + dataf = "id2vec_data.tsv" + with open(dataf, "w") as fout: + for vec in embeddings: + fout.write("\t".join(str(v) for v in vec)) + fout.write("\n") + log.info("Wrote %s", dataf) + jsonf = "id2vec.json" + with open(jsonf, "w") as fout: + fout.write("""{ + "embeddings": [ + { + "tensorName": "id2vec", + "tensorShape": [%s, %s], + "tensorPath": "http://0.0.0.0:8000/%s", + "metadataPath": "http://0.0.0.0:8000/%s" + } + ] +} +""" % (len(embeddings), len(embeddings[0]), dataf, metaf)) + log.info("Wrote %s", jsonf) + if run_server and not web_server.running: + web_server.start() + url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf + log.info(url) + if run_server: + if shutil.which("xdg-open") is not None: + os.system("xdg-open " + url) + else: + browser = os.getenv("BROWSER", "") + if browser: + os.system(browser + " " + url) + else: + print("\t" + url) + + +def wait(): + log = logging.getLogger("projector") + secs = int(os.getenv("PROJECTOR_SERVER_TIME", "60")) + log.info("Sleeping for %d seconds, safe to Ctrl-C" % secs) + try: + time.sleep(secs) + except KeyboardInterrupt: + pass + web_server.stop()