-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocessing_worker.js
More file actions
76 lines (63 loc) · 2.91 KB
/
preprocessing_worker.js
File metadata and controls
76 lines (63 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// Words that removeStopwords() filters out of a snippet. Entries are lowercase,
// matching the lowercased text produced by clearText().
let stopwords = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now'];
// Token arrays appended by preprocessing(), one entry per processed result.
// NOTE(review): nothing in this file clears it, so it keeps growing across
// messages handled by this worker - confirm that accumulation is intended.
let corpus = [];
/**
 * Drops search results whose `url` has already been seen, keeping the first
 * occurrence of each URL and preserving the original order.
 *
 * @param {Array<Object>} searchResultsData - Search results; each item's
 *   `url` property is used as the identity key.
 * @returns {Array<Object>} A new array with duplicates removed; the input
 *   array itself is not modified.
 */
function removeDuplicates(searchResultsData){
  // A Set gives constant-time membership checks, so the whole pass is linear
  // instead of rescanning the array from the start for every item.
  const seenUrls = new Set();
  return searchResultsData.filter((item) => {
    if (seenUrls.has(item.url)) {
      return false;
    }
    seenUrls.add(item.url);
    return true;
  });
}
/**
 * Normalizes raw text before stopword removal and tokenization.
 *
 * The text is lowercased, every character that is not a Latin letter, one of
 * the Cyrillic letters listed in the pattern, a digit, or a hyphen is replaced
 * with a space, and runs of two or more whitespace characters are collapsed
 * into a single space. The result is not trimmed, so a single leading or
 * trailing space may remain.
 *
 * @param {*} text - Text to clean. `null`/`undefined` yield an empty string;
 *   other non-string values are converted with `String()`.
 * @returns {string} The cleaned, lowercased text.
 */
function clearText(text) {
  // A result without a snippet should not crash the whole preprocessing run.
  if (text == null) {
    return "";
  }
  return String(text)
    .toLowerCase()
    // Whitespace and "]" are already outside the allowed set, so the negated
    // class alone covers them; no separate whitespace alternative is needed.
    .replace(/[^A-Za-zА-Яа-яЁёЇїІіҐґЄє0-9\-]/g, " ")
    .replace(/\s{2,}/g, " ");
}
/**
 * Removes stopwords from a space-separated string.
 *
 * The text is split on single spaces, every "." is stripped from each word,
 * and words found in the module-level `stopwords` list are dropped. Empty
 * words (produced by leading, trailing or doubled spaces) are kept, so the
 * spacing of the remaining words is preserved.
 *
 * @param {string} text - Space-separated text, expected to be lowercased
 *   already because the stopword comparison is case-sensitive.
 * @returns {string} The remaining words joined with single spaces.
 */
function removeStopwords(text){
  // Built per call so that any change to the `stopwords` list is respected,
  // while each lookup stays constant-time.
  const stopwordSet = new Set(stopwords);
  const keptWords = [];
  for (const word of text.split(' ')) {
    const cleanedWord = word.split(".").join("");
    if (!stopwordSet.has(cleanedWord)) {
      keptWords.push(cleanedWord);
    }
  }
  return keptWords.join(' ');
}
/**
 * Splits cleaned text into word tokens.
 *
 * A token is a run of Latin letters, the Cyrillic letters that clearText()
 * preserves, digits, or underscores; every other character (including the
 * hyphen) acts as a separator. Empty tokens are discarded, so leading or
 * trailing separators and empty input produce no "" entries.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} The tokens in order of appearance; empty for input
 *   that contains no token characters.
 */
function tokenizer(text){
  // `\W` only knows ASCII word characters and would treat every Cyrillic
  // letter as a separator, so the alphabet is spelled out explicitly.
  return text
    .split(/[^A-Za-zА-Яа-яЁёЇїІіҐґЄє0-9_]+/)
    .filter((token) => token !== "");
}
/**
 * Adds the given tokens to the shared frequency map.
 *
 * Reads and mutates the worker-global `freqMap` (assigned by the message
 * handler from the incoming message), incrementing the count of every token
 * and starting at 1 for tokens not seen before.
 *
 * @param {string[]} preprocessedResult - Tokens of one preprocessed snippet.
 * @returns {Map} The same global `freqMap` instance, after the update.
 */
function create_freqMap(preprocessedResult){
  for (const word of preprocessedResult) {
    // A missing key reads as undefined, which counts as zero occurrences.
    freqMap.set(word, (freqMap.get(word) || 0) + 1);
  }
  return freqMap;
}
/**
 * Runs the text-preprocessing pipeline over a list of search results.
 *
 * Duplicates (by URL) are removed first. For each remaining result the
 * `snippet` is cleaned, stripped of stopwords and tokenized; the tokens are
 * stored on the result as `preprocessedResults`, and `vectors` / `clicks` are
 * initialised to `[]` / `0`. The tokens are also appended to the module-level
 * `corpus` and counted into the shared frequency map via create_freqMap().
 *
 * @param {Array<Object>} searchResultsData - Search results; each item is
 *   read for its `url` and `snippet` properties.
 * @returns {Array<Object>} The deduplicated results, each mutated in place
 *   with the fields described above.
 */
function preprocessing(searchResultsData) {
  const uniqueResults = removeDuplicates(searchResultsData);
  for (const result of uniqueResults) {
    const tokens = tokenizer(removeStopwords(clearText(result.snippet)));
    result.preprocessedResults = tokens;
    result.vectors = [];
    result.clicks = 0;
    corpus.push(tokens);
    // create_freqMap updates the shared frequency map in place and returns
    // that same map, so there is nothing to assign back here.
    create_freqMap(tokens);
  }
  return uniqueResults;
}
// Worker entry point. Expects a message whose data is
// [searchResultsData, freqMap] and replies with
// [preprocessedData, corpus, freqMap].
self.addEventListener('message', (event) => {
  const searchResultsData = event.data[0];
  // create_freqMap() reads and updates `freqMap` as a worker-global, so it is
  // published explicitly on `self` instead of through an undeclared assignment.
  self.freqMap = event.data[1];
  // Only preprocessing happens here; no vectorization is performed in this
  // worker.
  const preprocessedData = preprocessing(searchResultsData);
  // NOTE(review): `corpus` is never cleared in this file, so the posted corpus
  // also contains token arrays from earlier messages - confirm that is intended.
  self.postMessage([preprocessedData, corpus, self.freqMap]);
});