-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTrain.py
More file actions
88 lines (71 loc) · 3.6 KB
/
Train.py
File metadata and controls
88 lines (71 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pickle
from canny_cf import *
# import matplotlib.pyplot as plt
import scipy.sparse as sp
# Configuration for the training script: the code below loads the configured
# dataset(s), trains the model and pickles the results.

# Run mode, either 'regular' or 'combined'.
#   'regular'  - load the single dataset at FNAME, train on most of it and test
#                on a small portion.
#   'combined' - load both the local (FNAME) and tourist (FNAME2) data, train on
#                the local data (and some tourist data) and test on the
#                remainder of the tourist data.
MODE = 'combined'

# Input pickles.
FNAME = 'data/LasVegas_local.pck'
FNAME2 = 'data/LasVegas_tourist.pck'  # tourist dataset; only used if MODE == 'combined'

# Where to save the results of the simulation.
SAVENAME = 'data/combined_results.pck'

# Model / training settings.
k = 10  # number of latent factors
ITERS = 15  # number of times to run EM
PROPORTION = 0.2  # train-to-test proportion handed to the split
# Load the pickled matrices and build the train/test split for the selected MODE.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only point FNAME / FNAME2 at trusted data.
if MODE == 'regular':
    with open(FNAME, 'rb') as pickle_file:
        Ycol = pickle.load(pickle_file)
    print('Ycol shape: {}'.format(Ycol.shape))
    # Split the dataset; the last two return values are not needed in this mode.
    Ytest, Ytrain, test_row_ind, test_col_ind, _, _ = split_Y(Ycol, PROPORTION)
    Ytotal = Ycol
    # Equal to the full matrix shape here, so it carries no extra information
    # in the 'regular' case.
    original_shape = Ytotal.shape
elif MODE == 'combined':
    # Open the first file (local file)
    with open(FNAME, 'rb') as pickle_file:
        Ycol1 = pickle.load(pickle_file)
    print('Ycol shape: {}'.format(Ycol1.shape))
    # Open the second file (tourist file)
    with open(FNAME2, 'rb') as pickle_file:
        Ycol = pickle.load(pickle_file)
    print('Ycol shape: {}'.format(Ycol.shape))
    # Split dataset 2 (the tourist data) into testing and training
    (Ytest, Ytrain, test_row_ind, test_col_ind,
     train_row_ind, train_col_ind) = split_Y(Ycol, PROPORTION)
    # Shape of the tourist-only training matrix; used to split the data again
    # when evaluating (its column count marks where the tourist columns end).
    original_shape = Ytrain.shape
    # Only keep the businesses (rows) of the local data that survived the
    # splitting of the tourist data.
    Ycol1 = Ycol1[train_row_ind, :]
    # Now combine our two training matrices (Ycol1 is entirely used for training)
    Ytrain = sp.hstack([Ytrain, Ycol1], format='csc')
    Ytotal = sp.hstack([Ycol, Ycol1], format='csc')
    print('Ytrain shape: {}'.format(Ytrain.shape))
else:
    # Fail early with a clear message instead of a NameError further down.
    raise ValueError(
        "MODE must be 'regular' or 'combined', got {!r}".format(MODE)
    )
# Precautionary clean-up: keep only the rows and columns of Ytrain that contain
# at least one nonzero entry, and apply the same selection to Ytotal.
# NOTE(review): np is not imported explicitly in this file; presumably it is
# provided by `from canny_cf import *` - confirm.
print('Ytrain before removing empty: {}'.format(Ytrain.shape))
nonzero_idx = Ytrain.nonzero()
train_row_ind = np.unique(nonzero_idx[0])
train_col_ind = np.unique(nonzero_idx[1])
Ytrain = Ytrain[train_row_ind, :][:, train_col_ind]
Ytotal = Ytotal[:, train_col_ind][train_row_ind, :]
print("below shouldn't change. If it does there may be a problem. See code comments for details")
print('Ytrain after removing empty: {}'.format(Ytrain.shape))
print('Ytotal after removing empty: {}'.format(Ytotal.shape))
# The shapes printed above are expected to match the ones printed before the
# clean-up. If they differ, we cannot tell whether users were removed from the
# test set or from the training set, which may throw off the testing results
# because we would no longer be comparing the same users.
# Train. `train` and `split_others` are not defined in this file; they are
# expected to come from `from canny_cf import *`.
lam_diff, psi_diff, train_err, test_err, x, Lam = train(
    Ytotal, Ytrain, k, ITERS, Ytest, PROPORTION,
    test_row_ind, test_col_ind, original_shape
)
# Get the correct rows and columns of Lambda and x. Only the first
# original_shape[1] columns of Ytotal are handed to split_others.
x_test, Lam_test = split_others(
    Ytotal[:, :original_shape[1]], x, Lam, PROPORTION, test_row_ind, test_col_ind
)
# Bundle everything that gets written to SAVENAME.
results = dict(
    x=x,
    Ytest=Ytest,
    Ytrain=Ytrain,
    Lam=Lam,
    x_test=x_test,
    Lam_test=Lam_test,
    lam_diff=lam_diff,
    psi_diff=psi_diff,
    train_err=train_err,
    test_err=test_err,
)
# Use a context manager so the output file is flushed and closed as soon as the
# dump finishes, instead of relying on garbage collection of the file object.
with open(SAVENAME, 'wb') as results_file:
    pickle.dump(results, results_file)