-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_data.py
More file actions
150 lines (111 loc) · 5.18 KB
/
Copy pathclean_data.py
File metadata and controls
150 lines (111 loc) · 5.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from pathlib import Path
import pandas as pd
import os
import re
import math
from utils import time_idx
raw_weather_type = ['薄曇','晴れ','くもり','少雨','弱い雨','強い雨','激しい雨','猛烈な雨','みぞれ(弱い)','雪 (強い)','雪 (弱い)','みぞれ(強い)']
weather_type = ['light cloud', 'sunny', 'cloudy', 'light rain', 'light rain', 'strong rain', 'heavy rain', 'heavy rain', 'sleet (weak)', 'snow (heavy)' ,'snow (weak)','sleet (strong)']
def save_xls(dict_df, path):
writer = pd.ExcelWriter(path)
for key in dict_df:
dict_df[key].to_excel(writer, key)
writer.save()
def init_dir(path):
cloud_dir = path.glob(f"cloud*.csv")
solar_dir = path.glob(f"solar*.csv")
weather_dir = path.glob(f"weather*.csv")
e_generator_dir = path.glob(f"*solar*.xlsx")
e_demand_dir = path.glob(f"*surplus*.xlsx")
return cloud_dir, solar_dir, weather_dir, e_generator_dir, e_demand_dir
def csv_process(dataset):
result = list()
# Process execution date
def target_date(sample):
date, time = sample.split("T")
time = time.split(":")
time = time[0] + ":" + time[1]
return date, time
for index, data in dataset.iterrows():
sample = dict()
#Target Date
date, time = target_date(data["target_date"])
sample["date"] = date + " " + time + ":00"
sample["time"] = time
#Features
sample["cloud"] = data["cloud(%)"]
sample["solar"] = data["solar(W/m2)"]
try:
sample["telop_name"] = weather_type[raw_weather_type.index(data["telop_name"])]
except:
sample["telop_name"] = data["telop_name"]
result.append(sample)
return result
def create_csv(cloud_dir, solar_dir, weather_dir):
# Reconstruct Path Glob
cloud_dir = list(cloud_dir)[0]
solar_dir = list(solar_dir)[0]
weather_dir = list(weather_dir)[0]
cloud_data = pd.read_csv(cloud_dir)
solar_data = pd.read_csv(solar_dir)
weather_data = pd.read_csv(weather_dir)
csv_data = pd.merge(weather_data, solar_data, how = 'outer', on = ["target_date", "execution_date"])
csv_data = pd.merge(csv_data, cloud_data, how = 'outer', on = ["target_date", "execution_date"])
# CSV Process
csv_data = pd.DataFrame(csv_process(csv_data))
return csv_data
def xlsx_process(path):
result = list()
for dir in list(path):
f = pd.ExcelFile(dir)
for sheet_name in f.sheet_names:
# Storage temp xlsx file
df = f.parse(sheet_name = sheet_name)
df = df.drop(labels = [0,1,2,3], axis = 0)
df = df.to_excel("temp.xlsx", header = None)
# Read xlsx with right format
df_new = pd.read_excel("temp.xlsx", index_col = None)
for index, data in df_new.iterrows():
solar_state = 0
sample = dict()
sample["date"] = sheet_name + " " + data["時刻"] + ":00"
sample["time"] = data["時刻"]
for solar_panel in data.index:
if (solar_panel == "30101:電力量(Wh)"):
sample["power_demand"] = data[solar_panel]
elif (solar_panel == "30101:回生電力量(Wh)"):
sample["power_surplus"] = data[solar_panel]
elif (len(str(solar_panel)) > 2):
if solar_state == 0:
solar_state = 1
if (math.isnan(data[solar_panel])):
break
else:
sample["power_generation"] = int(data[solar_panel])
else:
sample["power_generation"]+=data[solar_panel]
result.append(sample)
return result
def create_xlsx(e_generator_dir, e_demand_dir):
e_generator_data = pd.DataFrame(xlsx_process(e_generator_dir))
e_demand_data = pd.DataFrame(xlsx_process(e_demand_dir))
xlsx_data = pd.merge(e_generator_data, e_demand_data, how = "outer", on = ["date", "time"])
return xlsx_data
def clean_dataset(args):
path = args.data_dir + "/" + args.station
path = Path(path)
# Read directory of input files
cloud_dir, solar_dir, weather_dir, e_generator_dir, e_demand_dir = init_dir(path)
# Generate csv/xlsx after preprocessing
csv_data = create_csv(cloud_dir, solar_dir, weather_dir)
xlsx_data = create_xlsx(e_generator_dir, e_demand_dir)
#Combine all files into only one DataFrame
station_data = pd.merge(xlsx_data, csv_data, how = "outer", on = ["date", "time"])
station_data[["year", "month", "day"]] = station_data.date.str.split("-", expand = True)
station_data["day"] = station_data["day"].apply(lambda x: int(x[:2]))
station_data["date"] = pd.to_datetime(station_data["date"])
station_data["half_hours_from_start"] = pd.DataFrame(time_idx(station_data["date"]))
station_data["weekday"] = station_data["date"].dt.dayofweek
station_data["group"] = str(0)
#station_data.to_excel("test.xlsx")
return station_data