-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtext_preprocessor.py
More file actions
39 lines (28 loc) · 1.15 KB
/
text_preprocessor.py
File metadata and controls
39 lines (28 loc) · 1.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import regex as re
import numpy as np
# Matches one "key: value" line; surrounding whitespace is stripped from both
# captured groups. Compiled once at import time instead of on every call.
_KEY_VALUE_PATTERN = re.compile(r'^\s*(.+?)\s*:\s*(.+?)\s*$', re.MULTILINE)

# Keys (compared case-insensitively) that must never be part of the free-text
# description because they are returned as separate fields.
_STRUCTURED_KEYS = frozenset({"value", "unit"})


def details(description: str):
    """Split a "key: value" catalog text into description, value and unit.

    Every line of the form ``key: value`` is parsed. Values that repeat an
    earlier value are dropped so the same text is not emitted twice in the
    description. The ``Value`` and ``Unit`` lines are kept out of the
    description and returned separately.

    Args:
        description: Multi-line text made of ``key: value`` lines. Anything
            that is not a string (for example the NaN pandas produces for an
            empty cell) is treated as having no content.

    Returns:
        A 3-tuple ``(description, value, unit)`` of strings:
        the remaining values joined with ``". "``, the text of the ``Value``
        line, and the text of the ``Unit`` line. Missing parts are ``""``.
    """
    # Non-string input (e.g. NaN) would make findall() raise TypeError and
    # abort a whole DataFrame.apply(); report it as an empty record instead.
    if not isinstance(description, str):
        return "", "", ""

    fields = {}
    seen_values = set()
    for key, value in _KEY_VALUE_PATTERN.findall(description):
        if key in ("Value", "Unit"):
            # Structured fields are always recorded: the duplicate filter
            # below exists to de-duplicate description text and must not
            # discard a Value/Unit that happens to equal an earlier value.
            fields[key] = value
            continue
        if value not in seen_values:
            fields[key] = value
            seen_values.add(value)

    # Compare keys case-insensitively so the capitalised "Value"/"Unit" keys
    # that are returned below are actually excluded from the description.
    desc = [
        text for key, text in fields.items()
        if key.lower() not in _STRUCTURED_KEYS
    ]
    return ". ".join(desc), fields.get("Value", ""), fields.get("Unit", "")
# Locations of the raw train/test splits, relative to the working directory.
train_path = "../dataset/train.csv"
test_path = "../dataset/test.csv"

# Load both splits before any processing or output happens.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Run details() over every catalog entry and expand the returned 3-tuples
# into one column each, row-aligned with the source frame.
processed_df = (
    train_df["catalog_content"]
    .apply(details)
    .apply(pd.Series)
)
processed_df.columns = ["description", "Value", "Unit"]

processed_df_test = (
    test_df["catalog_content"]
    .apply(details)
    .apply(pd.Series)
)
processed_df_test.columns = ["description", "Value", "Unit"]

# Append the parsed columns to the original frames and write the augmented
# copies into the current working directory.
aug_df = pd.concat([train_df, processed_df], axis="columns")
aug_df.to_csv("aug_train.csv")

aug_df_test = pd.concat([test_df, processed_df_test], axis="columns")
aug_df_test.to_csv("aug_test.csv")