Primary-care-code-release/main_script_3.py at main · NHSDigital/Primary-care-code-release · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
## POWER BI TABLE GENERATION (POWER BI TABLES FROM REFSET PORTAL)

# Runs SQL code for the Power BI tables, creates Excel files and txt files

"""
If there are changes to the PBI_Service_Full_Name_Mappings or PBI_Ruleset_Full_Name_Mappings Excel files (rare), these will be highlighted in the log and terminal messages. The code will stop running.
New sheets will be added to the Excel files copied to the current working directory so that the new and old Service types/Rulesets can be compared, and any relevant changes can be made to the Power BI tables in the shared drive.
"""

import logging
import time
import shutil
import pandas as pd, numpy as np
from pathlib import PurePath
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
import os

from src.utils.dates import class_Dates
from src.utils.databases import set_dbs_live_or_test, replace_db_dates_config_params
from src.utils.filepaths import filepaths
from src.utils.setup_fxns import log_setup, get_config, create_output_folder, move_outputs, replace_email_names_config_params, find_previous_defined_dates
from src.utils.file_fxns import txt_save, email, excel_save
from src.utils.connection_fxns import get_df_from_sql
from src.release_stages.update_Ruleset_Published import update_RP
from src.release_stages.power_bi import pbi_addsheet, pbi_change_check, pbi_refset_reldate_table, sql_to_excel_pbi
from sql.sql_power_bi import *

"""
Setup

- A logging file is created for run outputs and times, and any warnings or errors will be flagged here.
- User inputs are read in from the config file.
- Filepaths are defined.
- An outputs folder is created if it doesn't already exist (referred to throughout guidance as the publication folder, this is in a location specified in the config file, unless the process is run in test mode, then the outputs folder will remain in your repo).

Stages

1. Checks the existing mapping files in the root output location line up with the services and rulesets from the Cluster Management database.
2. Creates and exports the updated PowerBI xlsx files to your output folder. There should be 12.
3. Creates and exports the updated PowerBI txt files to your output folder. There should be 5.
4. Creates a draft email to the website development team to replace the txt zip file. Please note, this requires manual intervention before sending.

Please Note:
This code has a built in test mode for when it is being run as a trial or dev work is being done. For the official release run, please make sure test mode is turned off. This is done through the config file: test = false.
"""

#log file created in cwd
log_loc, log_name = log_setup("Script3")

#read in config file and set test to True/False
config = get_config('./setup/config.toml')

#find dates from config file and save into class 'Dates'
configDates = class_Dates(config['Dates'])

#replace dates in databases defined within config parameters
config = replace_db_dates_config_params(config, configDates)

#If in live mode, set databases classes (three separate) to contain live database/stored procedure names as properties
#If not in live mode, set database classes (three separate) to contain test database/stored procedure names as properties AND create/reset the test databases, as defined in the config, with the up to date records from the live databases.
test = config['Setup']['test_mode']
UKSNOMEDCT, LocalSNOMEDCT, ClusterManagement = set_dbs_live_or_test(config, test)

# Filepaths
filepath_dict = filepaths(config['Filepaths'], configDates)

#try to find dates used in previous script(s)
configDates = find_previous_defined_dates(test, filepath_dict, configDates, UKSNOMEDCT, LocalSNOMEDCT)

#define the email information from the config file and replace name
configEmails = replace_email_names_config_params(config)
email_dict = config['Autogenerated_Emails']['Email_5']

#output fldr created in cwd
create_output_folder(filepath_dict)

#-------------------------------------------------------------------------------
#Check the Rulesets Published table is up to date
errortype = update_RP(ClusterManagement, configDates.PCDreleaseDate, output=filepath_dict['publishing_TRUD_fldr'])
#raise an Exception if the error type is fatal (1), or the user has not checked the output.

if errortype == 1:
    raise Exception('Errors have been found in the Ruleset Publication table. Please fix and rerun. CHECK_Ruleset_Published_table.csv can be found in your outputs folder.')
#if the ruleset inactive date is greater than the PCD Release date, the ruleset is still active and the content is relevant for the Power BI tables.
elif errortype == 0:
    pass
else:
    checked = input("Please check the CHECK_Ruleset_Published_table.csv in your outputs folder. The 'Publication_issues' column will flag any rulsets which look to have potential errors. Please confirm you have checked this table and are happy with the results. [y/n]\n")
    if checked != "y":
        raise Exception("Please check the CHECK_Ruleset_Published_table.csv and rerun script.")
#-------------------------------------------------------------------------------
### Power BI Report Tables ###

# Mapping Code

t1_start = time.perf_counter()
#Create an initial dataframe to be used in full name mapping tables checks
ServiceRulesetIDs_df = get_df_from_sql(server=ClusterManagement.server,
                                       database=ClusterManagement.db,
                                       query=sql_query_ServiceRulesetIDs(ClusterManagement))

# Service Full Name Mappings checks
# Checks currently active services against the PBI_Service_Full_Name_Mappings file used by the Power BI report.
# Normally, there will be no changes, and a confirmation message will be displayed.
# If there are any differences, then service(s) may need to be added/removed from the PBI_Service_Full_Name_Mappings file.
# A worksheet is added so that the new and old lists can be compared.

#Create a ServiceIDs_df dataframe of currently active services.
ServiceIDs_df = ServiceRulesetIDs_df.drop_duplicates(subset=['Service_ID'], ignore_index = True)
ServiceIDs_df = ServiceIDs_df.drop('Ruleset_ID', axis=1)
ServiceIDs_df = ServiceIDs_df.sort_values(by=["Service_ID"])
ServiceIDs_df.reset_index(drop = True, inplace = True)

#Read the PBI_Service_Full_Name_Mappings.xlsx file from shared drive.
#Compare with the ServiceIDs_df dataframe just made.
ServiceNameMapSourceLoc = PurePath(filepath_dict['rootPowerBIxlsx'], "PBI_Service_Full_Name_Mappings.xlsx")
ServiceNameMap_df = pd.read_excel(ServiceNameMapSourceLoc)
ServiceNameMap_df = ServiceNameMap_df.sort_values(by=["Service"])
comparesvc_df = ServiceIDs_df.merge (ServiceNameMap_df, how='outer', on=None, left_on='Service_ID', right_on='Service',  sort=False,  copy=None, indicator=False, validate=None)
comparesvc_df["Service_ID_Match"] = np.where(comparesvc_df["Service_ID"] == comparesvc_df["Service"].values, True, False)

#Check for differences and create an alert text.
ServiceChangeFlag, serviceMappingsAlertText  = pbi_change_check(comparesvc_df, "Service", ServiceIDs_df, filepath_dict, ServiceNameMapSourceLoc, ClusterManagement.db, config)

# Ruleset Full Name Mappings
# Checks currently active rulesets against the PBI_Ruleset_Full_Name_Mappings file used by the Power BI report.
# Normally, there will be no changes, and a confirmation message will be displayed.
# If there are any differences, then rulesets(s) may need to be added/removed from the PBI_Ruleset_Full_Name_Mappings file.
# A worksheet is added so that the new and old lists can be compared.

#Create a RulesetsIDs_df dataframe of currently active rulesets.
RulesetIDs_df = ServiceRulesetIDs_df.drop_duplicates (subset=['Ruleset_ID'], ignore_index = True)
RulesetIDs_df = RulesetIDs_df.drop('Service_ID', axis=1)
RulesetIDs_df = RulesetIDs_df.sort_values(by=["Ruleset_ID"])
RulesetIDs_df.reset_index(drop = True, inplace = True)

#Read the Power BI table currently in use from Shared drive
#Compare with the RulesetIDs_df dataframe just made.
RulesetNameMapSourceLoc = PurePath(filepath_dict['rootPowerBIxlsx'], "PBI_Ruleset_Full_Name_Mappings.xlsx")
RulesetNameMap_df = pd.read_excel(RulesetNameMapSourceLoc)
RulesetNameMap_df = RulesetNameMap_df.sort_values(by=["Ruleset"])
comparers_df =  RulesetIDs_df.merge(RulesetNameMap_df, how='outer', on=None, left_on='Ruleset_ID', right_on='Ruleset',  sort=False,  copy=None, indicator=False, validate=None)
comparers_df["Ruleset_ID_Match"] = np.where(comparers_df["Ruleset_ID"] == comparers_df["Ruleset"].values, True, False)

#Check for differences and create an alert text.
RulesetChangeFlag,rulesetMappingsAlertText  = pbi_change_check(comparers_df, "Ruleset", RulesetIDs_df, filepath_dict, RulesetNameMapSourceLoc, ClusterManagement.db, config)

# Final Full Name Mappings Checks and alerts
# Checks for changes in Service/Ruleset IDs (e.g. new services/rulesets).
# If found, creates an exception text with instructions.
# Once any relevant amendments have been made, run the script again.

exceptionAlertText = f"{serviceMappingsAlertText}\n{rulesetMappingsAlertText}"

if ServiceChangeFlag == True or RulesetChangeFlag == True:
    message = f"\n\nGPDESA INSTRUCTIONS:\nChange(s) to Services/Rulesets (see below for details):\n{exceptionAlertText}\nMake the required changes to the relevant Full_Name_Mappings workbook(s) and run this script again."
    logging.critical(message)
    raise Exception(message)

#-------------------------------------------------------------------------------------------------------------------

## Tables Code ##

pbi_tables_filenames_dict = { # This returns all the published rulesets (Query A).
                              sql_query_ServiceRulesetIDs: 'PBI_Rulesets_Service_Rulesets_df.xlsx',
                              # Use the Cluster_Ruleset table to join to the table created in Query A on Service_ID, Ruleset_ID and Ruleset_Version. # This returns all the clusters used by the published rulesets.
                              sql_query_ClusterRuleset: 'PBI_Cluster_Ruleset_df.xlsx',
                              # Use the Clusters table to join to the table created in Query B on Cluster_ID.# This returns the Cluster_ID, Cluster_Description and Refset_ID for all the clusters used by the published rulesets.
                              sql_query_Clusters: 'PBI_Clusters_df.xlsx',
                              # Use the Output_Ruleset table to join to the table created in Query A on Service_ID, Ruleset_ID and Ruleset_Version.# This returns the Output_IDs and Output_Versions used by the published rulesets.
                              sql_query_OutputRuleset: 'PBI_Output_Ruleset_df.xlsx',
                              # Use the Outputs table to join to the table created in Query D on Output_ID and Output_Version.# This returns the Output_Description and Output_Type for those outputs used by the published rulesets.
                              sql_query_Outputs: 'PBI_Outputs_df.xlsx',
                              # Use the Population_Ruleset table to join to the table created in Query A on Service_ID, Ruleset_ID and Ruleset_Version. # This returns the Population_IDs and Population_Versions for those populations used by the published rulesets.
                              sql_query_PopulationRuleset:'PBI_Population_Ruleset_df.xlsx' ,
                              # Use the Populations table to join to the table created in Query F on Population_ID and Population_Version.# This returns the Population_Description and Population_Type for those populations used by the published rulesets.
                              sql_query_Populations: 'PBI_Populations_df.xlsx',
                              # Use the Cluster_Population table to join to the table created in Query G and the table created in Query C.# This returns the Population_IDs, Population_Versions and Cluster_IDs for those populations used by the published rulesets.
                              sql_query_ClusterPopulation: 'PBI_Cluster_Population_df.xlsx',
                              # Use the Output_Population table to join to the table created in Query F and the table created in Query E.# This returns the Population_IDs and Population_Versions and which Output_IDs and Output_Versions are used by them in the published rulesets.
                              sql_query_OutputPopulation: 'PBI_Output_Population_df.xlsx',
                              # Use the Cluster_Output table to join to the table created in Query D and the table created in Query B.# This returns the Output_IDs and the Cluster_IDs that are used by these outputs in the published rulesets.
                              sql_query_ClusterOutput: 'PBI_Cluster_Output_df.xlsx'}

#run loop of returning sql tables and saving these as excel files
for key, value in pbi_tables_filenames_dict.items():
    sql_to_excel_pbi(key, ClusterManagement, filepath_dict['PowerBIxlsx_fldr'], value)

# Use the Refsets_Full_Ongoing table to join to the table created in Query C and the SCT_DESCRIPTION table.
# Unions to a further table at the end to bring in all the non-PCD clusters (mainly but not exclusively, drug refsets) so the cluster is included in this table but the codes aren't, there is an instruction for the user to check these in the SNOMED browser when they select this cluster in the Power BI portal.
# This returns the cluster/refset details along with their SNOMED codes for all the clusters used in the published rulesets.

PBI_df = get_df_from_sql(server=ClusterManagement.server,
                         database=ClusterManagement.db,
                         query=sql_query_ClusterRefsetContent(ClusterManagement, LocalSNOMEDCT, UKSNOMEDCT))

filename_pbi = 'PBI_Cluster_Refset_Content_df.xlsx'
#Export to Excel - Shared drive
excel_save(df=PBI_df, filepath= filepath_dict['PowerBIxlsx_fldr'], filename= filename_pbi)
logging.info(f'Creating {filename_pbi[:-8]} table from SQL and saving as an excel sheet complete.')

## Refset Release Date Table create and export ##
# Creates and exports a table with the latest refset release date from config for use as a card in the Power BI report.
pbi_refset_reldate_table(config, filepath_dict)

t1_end = time.perf_counter()
logging.info(f"PBI report table creations completed in {(t1_end - t1_start):0.2f} seconds.")

#--------------------------------------------------------------------------------------
## POWER BI REPORT TXT FILES
#--------------------------------------------------------------------------------------

t2_start = time.perf_counter()

# 1. PCD_Refset_Content file

TRUDRel = configDates.PCDreleaseDate
logging.info(f"Starting .txt files creation and export using date {TRUDRel}. This process takes approximately 2 minutes.")

txtfile1_df = get_df_from_sql(server=ClusterManagement.server,
                              database=ClusterManagement.db,
                              query=sql_query_PCDRefsetContentFile(ClusterManagement,
                                                      LocalSNOMEDCT,
                                                      UKSNOMEDCT))
txtfile1_df = txtfile1_df.astype('string')
#logging.info(txtfile1_df)

#Export to Excel - Power BI Improvements folder - txt files folder
txt_save(df=txtfile1_df,
         filepath=filepath_dict['PowerBItxt_fldr'],
         filename=str(TRUDRel) + '_PCD_Refset_Content.txt')

# 2. Output_LookUp file

txtfile2_df = get_df_from_sql(server=ClusterManagement.server,
                              database=ClusterManagement.db,
                              query=sql_query_OutputDescFile(ClusterManagement))
txtfile2_df = txtfile2_df.astype('string')
#logging.info(txtfile2_df)

#Export to Excel - Power BI Improvements folder - txt files folder
txt_save(df=txtfile2_df,
         filepath=filepath_dict['PowerBItxt_fldr'],
         filename= str(TRUDRel) + '_PCD_Output_Descriptions.txt')

# 3. PCD_Refset_Content_by_Output file

txtfile3_df = get_df_from_sql(server=ClusterManagement.server,
                              database=ClusterManagement.db,
                              query=sql_query_RefsetByOutputFile(ClusterManagement,
                                                                 LocalSNOMEDCT,
                                                                 UKSNOMEDCT))

txtfile3_df = txtfile3_df.astype('string')
#logging.info(txtfile3_df)

#Export to Excel - Power BI Improvements folder - txt files folder
txt_save(df=txtfile3_df,
         filepath=filepath_dict['PowerBItxt_fldr'],
         filename=str(TRUDRel) + '_PCD_Refset_Content_by_Output.txt')


# 4. PCD_Service_Full_Name_Mappings file

# Reads in the PBI_Service_Full_Name_Mappings.xlsx and creates a df
# Drops the Display_Order column as this is not needed for the publication (it is only needed to to order how the Services are listed in the Power BI portal)
txtfile4_df = ServiceNameMap_df.drop('Display_Order', axis=1)

#Export to Excel - Power BI Improvements folder - txt files folder
txt_save(df=txtfile4_df,
         filepath=filepath_dict['PowerBItxt_fldr'],
         filename=str(TRUDRel) + '_PCD_Service_Full_Name_Mappings.txt')

# 5. PCD_Ruleset_Full_Name_Mappings file

# Reads in the PBI_Ruleset_Full_Name_Mappings.xlsx and creates a df
txtfile5_df = RulesetNameMap_df

#Export to Excel - Power BI Improvements folder - txt files folder
txt_save(df=txtfile5_df,
         filepath=filepath_dict['PowerBItxt_fldr'],
         filename=str(TRUDRel) + '_PCD_Ruleset_Full_Name_Mappings.txt')


t2_end = time.perf_counter()
if divmod((t2_end - t2_start),60)[0] >= 2:
    pluralchar = "s"
else:
    pluralchar = ""
logging.info(f"Txt file creation completed in {(t2_end - t2_start):0.2f} seconds ({divmod((t2_end - t2_start),60)[0]:0.0f} minute{pluralchar} {divmod((t2_end - t2_start),60)[1]:0.2f} seconds).")

# WEBSITE UPDATE EMAIL
#Email body
body = email_dict['body']
body = body.replace('\n', '<br>')
body = body.replace('<img>', '<img src="cid:image1">')
body = MIMEText(body,'html') #must be html to include image
#image
fp = open(PurePath(email_dict['image_embed']), 'rb')
image = MIMEImage(fp.read())
fp.close()
image.add_header('Content-ID', '<image1>')

#composition of email 3
email(email_subject=email_dict['subject'],
      email_To=email_dict['To'],
      email_Cc=email_dict['Cc'],
      email_body=body,
      outfile_path=filepath_dict['publishing_TRUD_fldr'],
      outfile_name=f"{email_dict['subject']}.eml",
      attach=False,
      email_image=image)

#move service and ruleset mapping files to shared drive location if not in test mode
if test == 'false':
    shutil.copy(ServiceNameMapDestLoc,ServiceNameMapSourceLoc)
    shutil.copy(RulesetNameMapDestLoc,RulesetNameMapSourceLoc)
else:
    pass

#move outputs to publication folder if in live mode, otherwise leave in cwd
filepath_dict = move_outputs(filepath_dict, script=3, test=test)

logging.info('Script 3 completed successfully. Please read through the log for any warning messages.')
logging.shutdown()

#move logger to publication folder if in live mode, otherwise leave in cwd
shutil.move(PurePath(log_loc, log_name), PurePath(filepath_dict['publishing_TRUD_fldr'], log_name))