diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py
index b506456..060a67b 100644
--- a/task-07/get_top_names.py
+++ b/task-07/get_top_names.py
@@ -6,11 +6,32 @@
import pandas as pd
+
def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False):
+ """
+ open `filename`, and yield the lines between
+ the line that contains `start_text` and the line that contains `end_text`
+ """
# Add your code for Step 1 here
# Just copy your answer from task-03
# Remove the raise statement when you are done
- raise NotImplementedError
+
+ with open(filename) as fh:
+ in_table = False
+ for line in fh:
+
+ if start_text in line:
+ in_table = True
+ if not include_start:
+ continue
+
+ elif end_text in line:
+ if include_end:
+ yield line
+ break
+
+ if in_table:
+ yield line
if __name__ == '__main__':
@@ -25,10 +46,70 @@ def extract_data_lines(filename, start_text, end_text, include_start=False, incl
# This will involve a for loop that iterates over `data_lines`
# For each row, you will append a tuple to `records`
+ # '
| 2017 | \n'
+ # 'Emma | Olivia | Ava | Isabella | Sophia | Liam | Noah | William | James | Logan |
\n'
+
+ for line in data_lines:
+ if 'align="center"' in line:
+ year = int(line.strip().split('| ')[1].split(' | ')[0])
+ else:
+ names = [it.split('')[0] for it in line.strip().split('')[1:]]
+ for ii, name in enumerate(names):
+ rank = (ii % 5) + 1
+ gender = 'female' if ii < 5 else 'male'
+ records.append((year, gender, rank, name))
+
data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name'])
# Add your code for Step 3 here
# You will use `data` to find and print out the answers for each questions listed in Task 4
# For example, to answer question 1:
+ # 1. Which years Emma is the most chosen names?
print(data.query('name == "Emma"').query('rank == 1')['year'].tolist())
+ print(data.query('name == "Emma" and rank == 1')['year'].tolist())
+
+ # 2. Which name had been the most chosen name for the longest consecutive years?
+ male = {'previous': '', 'cnt': 0, 'runs': []}
+ female = {'previous': '', 'cnt': 0, 'runs': []}
+ for __, it in data.query('rank == 1').iterrows():
+ dd = male if it.gender == 'male' else female
+ if it['name'] == dd['previous']:
+ dd['cnt'] += 1
+ else:
+ if dd['previous'] != '':
+ dd['runs'].append((dd['previous'], dd['cnt']))
+ dd['cnt'] = 1
+ dd['previous'] = it['name']
+
+ runs = pd.DataFrame(male['runs'] + female['runs'], columns=['name', 'cnt'])
+ print('Name with the longest run at the top:', runs.name[runs.cnt.idxmax()])
+
+ # 3. How many unique male names have be on top 5 between years 1980 and 2000?
+ print(len(set(data.query('year >= 1980 and year <= 2000 and gender == "male"')['name'])))
+
+ # 4. Are there more unique male names or more unique female names that are on top 5?
+ unique_male_names = len(set(data.query('gender == "male"')['name']))
+ unique_female_names = len(set(data.query('gender == "female"')['name']))
+
+ if unique_male_names > unique_female_names:
+ print('More unique male names.')
+ elif unique_male_names < unique_female_names:
+ print('More unique female names.')
+ else:
+ print('Same number of unique male and female names.')
+
+ # 5. What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?
+ previous = ''
+ count = 0
+ runs = []
+ for it in data.query('gender == "male" and rank == 1')['name']:
+ if it == previous:
+ count += 1
+ else:
+ if previous != '':
+ runs.append(count)
+ count = 1
+ previous = it
+
+ print('Distribution:\n', pd.value_counts(runs))
|