astropgh · kuanweih · Oct 28, 2018 · Oct 29, 2018
diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py
@@ -5,12 +5,29 @@
 """
 
 import pandas as pd
+import numpy as np
 
-def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False):
-    # Add your code for Step 1 here
-    # Just copy your answer from task-03
-    # Remove the raise statement when you are done
-    raise NotImplementedError
+def extract_data_lines(filename, start_text, end_text,
+                       include_start=False, include_end=False):
+    """
+    open `filename`, and yield the lines between
+    the line that contains `start_text` and the line that contains `end_text`
+    """
+    to_yield = False
+    with open(filename) as fh:
+        for line in fh:
+            if end_text in line:
+                if include_end:
+                    yield line
+                break
+            if include_start == False:
+                if to_yield:
+                    yield line
+            if start_text in line:
+                to_yield = True
+            if include_start:
+                if to_yield:
+                    yield line
 
 
 if __name__ == '__main__':
@@ -19,16 +36,56 @@ def extract_data_lines(filename, start_text, end_text, include_start=False, incl
     end_text = '</table></center></div><!-- end #content -->'
 
     records = []
-    data_lines = extract_data_lines(filename, start_text, end_text, include_start=True)
+    data_lines = extract_data_lines(filename, start_text,
+                                    end_text, include_start=True)
 
-    # Add your code for Step 2 here
-    # This will involve a for loop that iterates over `data_lines`
-    # For each row, you will append a tuple to `records`
+    readyear = True  # flag for reading line of years
 
-    data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name'])
+    for line in data_lines:
+        if readyear&('<tr><td align="center">' in line):
+            year = int(line.replace('<tr><td align="center">', '').replace('</td>\n', ''))
+            readyear = False
+        elif '<tr><td align="center">' not in line:
+            line_10_names = line.replace('</td> <td>', ' ').replace("<td>", " ").replace("</td></tr>\n", " ").split()
+            for idx, name in enumerate(line_10_names):
+                gender = 'female' if idx < 5 else 'male'
+                rank = idx + 1 if idx < 5 else idx % 5 + 1
+                records.append((year, gender, rank, name))
+            readyear = True
+
+    data = pd.DataFrame.from_records(records,
+                                     columns=['year', 'gender', 'rank', 'name'])
 
     # Add your code for Step 3 here
     # You will use `data` to find and print out the answers for each questions listed in Task 4
 
-    # For example, to answer question 1:
+    print('Q1: Which years Emma is the most chosen names?')
     print(data.query('name == "Emma"').query('rank == 1')['year'].tolist())
+    print('\n')
+
+
+    print('Q2:Which name had been the most chosen name for the longest consecutive years?')
+
+
+    print('\n')
+
+
+    print('Q3:How many unique male names have been on top 5 between years 1980 and 2000?')
+    name_male = np.array(data.query('gender == "male"').query('year >= 1980').query('year <= 2000')['name'].tolist())
+    print(len(np.unique(name_male)))
+    print('\n')
+
+
+    print('Q4:Are there more unique male names or more unique female names that are on top 5?')
+    n_male = len(np.unique(np.array(data.query('gender == "male"')['name'].tolist())))
+    n_female = len(np.unique(np.array(data.query('gender == "female"')['name'].tolist())))
+    if n_female > n_male: print('There are more unique female names.')
+    elif n_male > n_female: print('There are more unique male names.')
+    else: print('There are equal unique names for female and male.')
+    print('\n')
+
+
+    print('Q5:What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?')
+    name_males = np.unique(np.array(data.query('gender == "male"')['name'].tolist()))
+    # years_name = [data.query('name == "{}"'.format(name))['year'].tolist() for name in name_males]
+    print('\n')