-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_data_and_transform_script.py
More file actions
71 lines (50 loc) · 2.79 KB
/
get_data_and_transform_script.py
File metadata and controls
71 lines (50 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#importing libraries
import jsonlines
import pandas as pd
from pandas import json_normalize
import inflection
#reading data
with jsonlines.open('./ga_sessions.json', 'r') as jsonl_f:
lst = [obj for obj in jsonl_f]
flattened_data_l1 = pd.json_normalize(lst, max_level = 1)
flattened_data_l1.head()
#########################Visits#########################
visits_cols = ['fullVisitorId', 'visitId', 'visitNumber', 'visitStartTime', 'device.browser', 'geoNetwork.country']
visits = flattened_data_l1.loc[:,visits_cols]
#changing visitiNumber to numeric
visits["visitNumber"] = pd.to_numeric(visits["visitNumber"])
visits["visitStartTime"] = pd.to_numeric(visits["visitStartTime"])
#changing visitstarttime to iso8601 timestamp (milliseconds) and then to string
visits['visitStartTime'] = visits['visitStartTime'] * 1000
visits['visitStartTime'] = pd.to_datetime(visits['visitStartTime'],unit='ms')
visits['visitStartTime'] = visits['visitStartTime'].astype(str)
#renaming columns in visits
visits.rename(columns= ({'device.browser': 'browser', 'geoNetwork.country': 'country'}), inplace = True)
#changing from camelCase to snake_case using inflection package
visits.rename(columns= (lambda x: inflection.underscore(x)), inplace = True)
#creating visits.json
visits.to_json('visits.json', orient='records', lines=True)
#########################Hits#########################
hits_cols = ['fullVisitorId', 'visitId', 'hits']
#Flattening data from lst -> stores json data
hits = pd.json_normalize(lst, record_path = ['hits'], meta = ['visitId', 'fullVisitorId','visitStartTime'])
hits = hits[['visitId','fullVisitorId','visitStartTime','hitNumber','type','time','page.pagePath','page.pageTitle','page.hostname']]
#changing hitnumber to numeric
hits["hitNumber"] = pd.to_numeric(hits['hitNumber'])
#changing visitStartTime to iso8601 timestamp
hits['visitStartTime'] = pd.to_numeric(hits['visitStartTime'])
hits['visitStartTime'] = hits['visitStartTime'] * 1000
#converting time to numeric
hits['time'] = pd.to_numeric(hits['time'])
#creating hit_timestamp
hits['hit_timestamp'] = hits['visitStartTime'] + hits['time']
hits['hit_timestamp'] = pd.to_datetime(hits['hit_timestamp'],unit='ms')
hits['hit_timestamp'] = hits['hit_timestamp'].astype(str)
#creating hit Id column -> hitId = visitId + fullVisitorId + hit_timestamp
hits["hit_id"] = hits['visitId'].astype(str) + '_' + hits['fullVisitorId'] + '_' + hits['hit_timestamp']
#dropping columns
hits.drop(columns = ['visitId','fullVisitorId','visitStartTime','time'], inplace = True)
hits.rename(columns= ({'page.pagePath': 'pagePath','page.pageTitle':'pageTitle', 'page.hostname': 'hostname','type': "hit_type"}), inplace = True)
hits.rename(columns= (lambda x: inflection.underscore(x)), inplace = True)
#creating hits.json
hits.to_json('hits.json', orient='records', lines=True)