diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py
index b506456..38b4aff 100644
--- a/task-07/get_top_names.py
+++ b/task-07/get_top_names.py
@@ -5,12 +5,29 @@
"""
import pandas as pd
+import numpy as np
-def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False):
- # Add your code for Step 1 here
- # Just copy your answer from task-03
- # Remove the raise statement when you are done
- raise NotImplementedError
def extract_data_lines(filename, start_text, end_text,
                       include_start=False, include_end=False):
    """
    Open `filename` and yield the lines between the first line that
    contains `start_text` and the next line that contains `end_text`.

    The end marker is only honoured once the start marker has been seen,
    so an occurrence of `end_text` earlier in the file neither stops the
    iteration nor gets yielded.

    Args:
        filename: path of the text file to read.
        start_text: substring marking the line where the region begins.
        end_text: substring marking the line where the region ends.
        include_start: when true, also yield the line holding `start_text`.
        include_end: when true, also yield the line holding `end_text`.

    Yields:
        The lines of the region, in file order, with their line endings
        intact.  Nothing is yielded if `start_text` never appears.  If
        `end_text` never appears after the start, every line up to the end
        of the file is yielded.
    """
    started = False
    with open(filename) as fh:
        for line in fh:
            if not started:
                start_at = line.find(start_text)
                if start_at < 0:
                    # Still before the region; end markers here are ignored.
                    continue
                started = True
                # On the start line an end marker only counts when it comes
                # after the start marker (keeps identical markers usable).
                if end_text in line[start_at + len(start_text):]:
                    if include_start or include_end:
                        yield line
                    return
                if include_start:
                    yield line
            elif end_text in line:
                if include_end:
                    yield line
                return
            else:
                yield line
if __name__ == '__main__':
@@ -19,16 +36,56 @@ def extract_data_lines(filename, start_text, end_text, include_start=False, incl
end_text = ''
records = []
- data_lines = extract_data_lines(filename, start_text, end_text, include_start=True)
+ data_lines = extract_data_lines(filename, start_text,
+ end_text, include_start=True)
- # Add your code for Step 2 here
- # This will involve a for loop that iterates over `data_lines`
- # For each row, you will append a tuple to `records`
+ readyear = True # flag for reading line of years
- data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name'])
+ for line in data_lines:
+ if readyear&('
| ' in line):
+ year = int(line.replace(' |
| ', '').replace(' | \n', ''))
+ readyear = False
+ elif '
| ' not in line:
+ line_10_names = line.replace(' | ', ' ').replace(" | ", " ").replace(" |
\n", " ").split()
+ for idx, name in enumerate(line_10_names):
+ gender = 'female' if idx < 5 else 'male'
+ rank = idx + 1 if idx < 5 else idx % 5 + 1
+ records.append((year, gender, rank, name))
+ readyear = True
+
+ data = pd.DataFrame.from_records(records,
+ columns=['year', 'gender', 'rank', 'name'])
# Add your code for Step 3 here
# You will use `data` to find and print out the answer to each of the questions listed in Task 4
- # For example, to answer question 1:
+ print('Q1: Which years Emma is the most chosen names?')
print(data.query('name == "Emma"').query('rank == 1')['year'].tolist())
+ print('\n')
+
+
+ print('Q2:Which name had been the most chosen name for the longest consecutive years?')
+
+
+ print('\n')
+
+
+ print('Q3:How many unique male names have been on top 5 between years 1980 and 2000?')
+ name_male = np.array(data.query('gender == "male"').query('year >= 1980').query('year <= 2000')['name'].tolist())
+ print(len(np.unique(name_male)))
+ print('\n')
+
+
+ print('Q4:Are there more unique male names or more unique female names that are on top 5?')
+ n_male = len(np.unique(np.array(data.query('gender == "male"')['name'].tolist())))
+ n_female = len(np.unique(np.array(data.query('gender == "female"')['name'].tolist())))
+ if n_female > n_male: print('There are more unique female names.')
+ elif n_male > n_female: print('There are more unique male names.')
+ else: print('There are equal unique names for female and male.')
+ print('\n')
+
+
+ print('Q5:What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?')
+ name_males = np.unique(np.array(data.query('gender == "male"')['name'].tolist()))
+ # years_name = [data.query('name == "{}"'.format(name))['year'].tolist() for name in name_males]
+ print('\n')