# feature-miner-pub/config.example at main · AMCeScience/feature-miner-pub · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os

# Working directory at the moment this module is imported.
CWD = os.getcwd()

# Input locations.
# Only DB_FILE is anchored to CWD; the remaining locations are plain
# relative path strings.
TEXT_DATA_LOCATION = 'Text_data/'
CLASSIFIER_LOCATION = 'run1'
CLASSIFIER_PART_LOCATION = '/'.join((CLASSIFIER_LOCATION, 'nvsone_parts'))
DB_FILE = '/'.join((CWD, 'Database', 'miner_database.db'))

# Output locations.
# Plots and correlations are written to sub-directories of OUTPUT_LOCATION.
OUTPUT_LOCATION = 'run1'
CORRELATION_LOCATION = '/'.join((OUTPUT_LOCATION, 'correlations'))
PLOT_LOCATION = '/'.join((OUTPUT_LOCATION, 'plots'))

# Document cleaning toggles.
# NOTE(review): the per-flag notes below are inferred from the names only;
# confirm them against the code that reads this config.
KEEP_DASHES = False       # presumably: keep dash characters while cleaning
KEEP_NUMBERS = False      # presumably: keep numeric tokens while cleaning
EXTRA_STOPWORDS = []      # presumably: additional stopwords to remove
CLEAN_TEST_DATA = False   # presumably: apply the cleaning to test data too

# Document-frequency bounds applied by the count vectorizer.
# NOTE(review): if this is scikit-learn's CountVectorizer (min_df / max_df),
# an int is an absolute document count and a float is a proportion of the
# documents - confirm against the caller.
MAX_DOCFREQ = 0.95
MIN_DOCFREQ = 2

# How many topic models to create.
NUM_TOPICMODELS = 5

# Number of reviews used per test set in the leave-one-out experiments.
# It also fixes how many folds have to be run:
#   folds = total number of reviews / NUM_REVIEWS_TEST_SET
NUM_REVIEWS_TEST_SET = 1

# Upper limits on the experiment size; -1 means "no limit".
#   MAX_FOLDS   - maximum number of folds to run (folds are calculated as
#                 total number of reviews / NUM_REVIEWS_TEST_SET)
#   MAX_REVIEWS - maximum number of reviews to include in the experiments
MAX_FOLDS = -1
MAX_REVIEWS = -1

# Sizes of the training sets to train with.
# Alternative, longer sweep:
# SIMILARITY_STEPS = [1, 2, 5, 10, 15, 20, 30, 40]
SIMILARITY_STEPS = [1]

# Parallelism settings.
# NOTE(review): POOL_PROCESSES is presumably the worker count handed to a
# process pool, with None leaving the choice to the library - confirm
# against the caller.
POOL_PROCESSES = None
# Number of cores to use when training; -1 means "no limit".
NUM_JOBS = -1

# Review groups, grouped by manually assigned ICD-10 codes
# GROUPS = {
#   'tuberculosis': ['CD009593', 'CD010705'],
#   'aspergillosis': ['CD007394', 'CD009135', 'CD009551', 'CD009579'],
#   'cancer': ['CD009944', 'CD011134', 'CD009323', 'CD010409', 'CD009519', 'CD010276', 'CD010173', 'CD009786'],
#   'dementia': ['CD010771', 'CD010632', 'CD008782', 'CD010775', 'CD010653', 'CD010896', 'CD010772', 'CD011145', 'CD010633', 'CD010386', 'CD010783', 'CD010860'],
#   'liver and bile duct': ['CD011549', 'CD011548', 'CD010339', 'CD010542'],
#   'joint pain': ['CD008643', 'CD007427', 'CD008686', 'CD009020', 'CD007431'],
#   'down syndrome': ['CD011984', 'CD011975', 'CD009925'],
#   'singles': ['CD010438', 'CD008054', 'CD009591', 'CD008691', 'CD008803', 'CD009647', 'CD008760', 'CD009372', 'CD012019', 'CD008081', 'CD009185', 'CD010023'],
# }
# Labels used to identify the groups in cosine similarity data
# GROUP_LABELS = [1, 2, 3, 4, 5, 6, 7, 'Other']

# Active review groups: group name -> list of review identifiers.
# NOTE(review): these identifiers look like placeholders for the example
# config; replace them with real review ids before running.
GROUPS = {
    'A': [
        'CD000002',
        'CD000003',
    ],
    'singles': [
        'CD000001',
    ],
}

# Labels used to identify the groups in the cosine similarity data.
# NOTE(review): 'other' presumably stands for the 'singles' group - confirm.
GROUP_LABELS = [
    'A',
    'other',
]