From 8771e48f165462e2531ebc3680f7350fa7178e28 Mon Sep 17 00:00:00 2001 From: Kw Huang Date: Sun, 28 Oct 2018 19:00:22 -0400 Subject: [PATCH 1/2] done step 2 --- task-07/get_top_names.py | 53 +++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py index b506456..76e88a4 100644 --- a/task-07/get_top_names.py +++ b/task-07/get_top_names.py @@ -6,11 +6,28 @@ import pandas as pd -def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False): - # Add your code for Step 1 here - # Just copy your answer from task-03 - # Remove the raise statement when you are done - raise NotImplementedError + +def extract_data_lines(filename, start_text, end_text, + include_start=False, include_end=False): + """ + open `filename`, and yield the lines between + the line that contains `start_text` and the line that contains `end_text` + """ + to_yield = False + with open(filename) as fh: + for line in fh: + if end_text in line: + if include_end: + yield line + break + if include_start == False: + if to_yield: + yield line + if start_text in line: + to_yield = True + if include_start: + if to_yield: + yield line if __name__ == '__main__': @@ -19,13 +36,25 @@ def extract_data_lines(filename, start_text, end_text, include_start=False, incl end_text = '' records = [] - data_lines = extract_data_lines(filename, start_text, end_text, include_start=True) - - # Add your code for Step 2 here - # This will involve a for loop that iterates over `data_lines` - # For each row, you will append a tuple to `records` - - data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name']) + data_lines = extract_data_lines(filename, start_text, + end_text, include_start=True) + + readyear = True # flag for reading line of years + + for line in data_lines: + if readyear&('' in line): + year = line.replace('', '').replace('\n', '') + readyear = False + elif '' not in line: + line_10_names = line.replace(' ', ' ').replace("", " ").replace("\n", " ").split() + for idx, name in enumerate(line_10_names): + gender = 'female' if idx < 5 else 'male' + rank = idx + 1 if idx < 5 else idx % 5 + 1 + records.append((year, gender, rank, name)) + readyear = True + + data = pd.DataFrame.from_records(records, + columns=['year', 'gender', 'rank', 'name']) # Add your code for Step 3 here # You will use `data` to find and print out the answers for each questions listed in Task 4 From 65cba6520e9350b395cf12829d086ae41098ab8a Mon Sep 17 00:00:00 2001 From: Kw Huang Date: Sun, 28 Oct 2018 20:19:09 -0400 Subject: [PATCH 2/2] answered Q1,3,4 in a stupid way.. --- task-07/get_top_names.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py index 76e88a4..38b4aff 100644 --- a/task-07/get_top_names.py +++ b/task-07/get_top_names.py @@ -5,7 +5,7 @@ """ import pandas as pd - +import numpy as np def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False): @@ -43,7 +43,7 @@ def extract_data_lines(filename, start_text, end_text, for line in data_lines: if readyear&('' in line): - year = line.replace('', '').replace('\n', '') + year = int(line.replace('', '').replace('\n', '')) readyear = False elif '' not in line: line_10_names = line.replace(' ', ' ').replace("", " ").replace("\n", " ").split() @@ -59,5 +59,33 @@ def extract_data_lines(filename, start_text, end_text, # Add your code for Step 3 here # You will use `data` to find and print out the answers for each questions listed in Task 4 - # For example, to answer question 1: + print('Q1: Which years Emma is the most chosen names?') print(data.query('name == "Emma"').query('rank == 1')['year'].tolist()) + print('\n') + + + print('Q2:Which name had been the most chosen name for the longest consecutive years?') + + + print('\n') + + + print('Q3:How many unique male names have been on top 5 between years 1980 and 2000?') + name_male = np.array(data.query('gender == "male"').query('year >= 1980').query('year <= 2000')['name'].tolist()) + print(len(np.unique(name_male))) + print('\n') + + + print('Q4:Are there more unique male names or more unique female names that are on top 5?') + n_male = len(np.unique(np.array(data.query('gender == "male"')['name'].tolist()))) + n_female = len(np.unique(np.array(data.query('gender == "female"')['name'].tolist()))) + if n_female > n_male: print('There are more unique female names.') + elif n_male > n_female: print('There are more unique male names.') + else: print('There are equal unique names for female and male.') + print('\n') + + + print('Q5:What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?') + name_males = np.unique(np.array(data.query('gender == "male"')['name'].tolist())) + # years_name = [data.query('name == "{}"'.format(name))['year'].tolist() for name in name_males] + print('\n')