From a227d715b0a95d12d4fc1a45d78cf50a1a15739d Mon Sep 17 00:00:00 2001
From: kmaster2520 <sathvik.kadaveru@gmail.com>
Date: Wed, 14 Feb 2018 14:01:15 -0500
Subject: [PATCH 1/6] basic toc

---
 api/models/heuristics/easeofnavigation.py | 26 ++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py
index 893307e..76fe944 100644
--- a/api/models/heuristics/easeofnavigation.py
+++ b/api/models/heuristics/easeofnavigation.py
@@ -1,7 +1,31 @@
 from models.heuristic import Heuristic
+from boilerpipe.extract import Extractor
 
 # Formal 3
 # Ensure ease of user navigation.
 class EaseOfNavigation(Heuristic):
     def score(self, eula):
-        return {'score': -1, 'max': 4, 'reason': 'Not implemented'}
\ No newline at end of file
+        extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html)
+        text = extractor.getText()
+        text1000 = text[:1000]
+        tocscore = 0
+        reason = ''
+        indexOfTable = -1
+
+        tocindicators = ['TABLE OF CONTENTS', 'Table Of Contents']
+
+        for ind in tocindicators:
+            if indexOfTable < 0:
+                indexOfTable = text1000.find('TABLE OF CONTENTS')
+            else:
+                break
+        
+        if indexOfTable < 0:
+            tocscore = 0
+            reason += 'No table of contents found near beginning of EULA. '
+        else:
+            tocscore = 4
+            reason += 'Found table of contents. '
+
+
+        return {'score': tocscore, 'max': 4, 'reason': reason, 'text': text1000}
\ No newline at end of file

From 73eff957e9e39d21c466f7600d6c9ece657c7d7e Mon Sep 17 00:00:00 2001
From: kmaster2520 <sathvik.kadaveru@gmail.com>
Date: Wed, 14 Feb 2018 14:03:48 -0500
Subject: [PATCH 2/6] add comment

---
 api/models/heuristics/easeofnavigation.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py
index 76fe944..e308fbb 100644
--- a/api/models/heuristics/easeofnavigation.py
+++ b/api/models/heuristics/easeofnavigation.py
@@ -5,21 +5,28 @@
 # Ensure ease of user navigation.
 class EaseOfNavigation(Heuristic):
     def score(self, eula):
+
+        # gets actual text since eula.text doesn't get everything
         extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html)
         text = extractor.getText()
+
+        # limit text to first 1000 characters
         text1000 = text[:1000]
-        tocscore = 0
-        reason = ''
-        indexOfTable = -1
+        tocscore = 0 # score for table of contents
+        reason = '' # reason
+        indexOfTable = -1 # index of matching text
 
+        # text to look out for
         tocindicators = ['TABLE OF CONTENTS', 'Table Of Contents']
 
+        # find index of indicator text
         for ind in tocindicators:
             if indexOfTable < 0:
                 indexOfTable = text1000.find('TABLE OF CONTENTS')
             else:
                 break
         
+        # indexOfTable < 0 --> not found
         if indexOfTable < 0:
             tocscore = 0
             reason += 'No table of contents found near beginning of EULA. '

From dd1f2a8914d4a90ca3d7145d8af9b546c7cf98da Mon Sep 17 00:00:00 2001
From: Mark <mclayton@gatech.edu>
Date: Fri, 16 Feb 2018 13:59:01 -0500
Subject: [PATCH 3/6] Changed naming conventions to match project, reduced
 redundant lines, added processing for non-html eulas, and added error
 handling for html-only eulas.

---
 api/models/heuristics/easeofnavigation.py | 30 +++++++++++------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py
index e308fbb..5e3b774 100644
--- a/api/models/heuristics/easeofnavigation.py
+++ b/api/models/heuristics/easeofnavigation.py
@@ -6,33 +6,31 @@
 class EaseOfNavigation(Heuristic):
     def score(self, eula):
 
-        # gets actual text since eula.text doesn't get everything
-        extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html)
-        text = extractor.getText()
+        if eula.html is not None:
+            # gets actual text since eula.text doesn't get everything
+            extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html)
+            text = extractor.getText()
+        else:
+            text = eula.text
 
         # limit text to first 1000 characters
-        text1000 = text[:1000]
+        text_first_1000 = text[:1000]
         tocscore = 0 # score for table of contents
         reason = '' # reason
-        indexOfTable = -1 # index of matching text
+        index_of_table = -1 # index of matching text
 
         # text to look out for
         tocindicators = ['TABLE OF CONTENTS', 'Table Of Contents']
 
         # find index of indicator text
         for ind in tocindicators:
-            if indexOfTable < 0:
-                indexOfTable = text1000.find('TABLE OF CONTENTS')
+            if index_of_table < 0:
+                index_of_table = text_first_1000.find('TABLE OF CONTENTS')
             else:
                 break
-        
-        # indexOfTable < 0 --> not found
-        if indexOfTable < 0:
-            tocscore = 0
-            reason += 'No table of contents found near beginning of EULA. '
-        else:
-            tocscore = 4
-            reason += 'Found table of contents. '
 
+        # Score of 4 if found, score of 0 if not
+        # TODO: additional scoring hyperlinked vs unlinked table
+        tocscore = 4 if index_of_table >= 0 else 0
 
-        return {'score': tocscore, 'max': 4, 'reason': reason, 'text': text1000}
\ No newline at end of file
+        return {'score': tocscore, 'max': 4, 'index': index_of_table}
\ No newline at end of file

From d15e630e231766654f801eff45c2641ba4bd8b78 Mon Sep 17 00:00:00 2001
From: kmaster2520 <sathvik.kadaveru@gmail.com>
Date: Mon, 19 Feb 2018 12:04:20 -0500
Subject: [PATCH 4/6] fixed loop to check indicator rather than same phrase
 over and over again

---
 api/models/heuristics/easeofnavigation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py
index 5e3b774..9b26ba3 100644
--- a/api/models/heuristics/easeofnavigation.py
+++ b/api/models/heuristics/easeofnavigation.py
@@ -25,7 +25,7 @@ def score(self, eula):
         # find index of indicator text
         for ind in tocindicators:
             if index_of_table < 0:
-                index_of_table = text_first_1000.find('TABLE OF CONTENTS')
+                index_of_table = text_first_1000.find(ind)
             else:
                 break
 

From 6b61991cb5d608c5cf2956a4046b8a972222a1e0 Mon Sep 17 00:00:00 2001
From: kmaster2520 <sathvik.kadaveru@gmail.com>
Date: Fri, 6 Apr 2018 11:59:07 -0400
Subject: [PATCH 5/6] use regex to find hyperlink

---
 api/models/heuristics/easeofnavigation.py | 38 +++++++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py
index 9b26ba3..8f70c38 100644
--- a/api/models/heuristics/easeofnavigation.py
+++ b/api/models/heuristics/easeofnavigation.py
@@ -1,6 +1,8 @@
 from models.heuristic import Heuristic
 from boilerpipe.extract import Extractor
 
+import re
+
 # Formal 3
 # Ensure ease of user navigation.
 class EaseOfNavigation(Heuristic):
@@ -29,8 +31,38 @@ def score(self, eula):
             else:
                 break
 
-        # Score of 4 if found, score of 0 if not
-        # TODO: additional scoring hyperlinked vs unlinked table
-        tocscore = 4 if index_of_table >= 0 else 0
+        if eula.html:
+
+            # find all table of contents entries and their hrefs
+            link_match = re.findall('(?<=<a href="#).+?(?=")', eula.html)
+            print 'link matches', link_match
+
+            # find targets of table of contents hyperlinks
+            name_match = re.findall('(?<=<a name=").+?(?=")', eula.html)
+            name_match += re.findall('(?<=<a id=").+?(?=")', eula.html)
+            print 'name matches', name_match
+
+            toc_hyperlinked = False
+            matches = 0
+            expected_matches = len(link_match)
+            for i in link_match:
+                found_match = False
+                for j in name_match:
+                    if i == j:
+                        found_match = True
+                        break
+                if found_match:
+                    matches += 1
+            
+            print matches, expected_matches
+            if matches > 0.75 * expected_matches:
+                toc_hyperlinked = True
+                
+
+            # Score of 4 if found, score of 0 if not
+            # TODO: additional scoring hyperlinked vs unlinked table
+            tocscore = 4 if toc_hyperlinked else 2
+        else:
+            tocscore = 4 if index_of_table >= 0 else 0
 
         return {'score': tocscore, 'max': 4, 'index': index_of_table}
\ No newline at end of file

From 198e963c9606a2508cecb9043526226e8adc2663 Mon Sep 17 00:00:00 2001
From: kmaster2520 <sathvik.kadaveru@gmail.com>
Date: Fri, 6 Apr 2018 12:00:45 -0400
Subject: [PATCH 6/6] remove print statements

---
 api/models/heuristics/easeofnavigation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py
index 8f70c38..8763bc0 100644
--- a/api/models/heuristics/easeofnavigation.py
+++ b/api/models/heuristics/easeofnavigation.py
@@ -35,12 +35,12 @@ def score(self, eula):
 
             # find all table of contents entries and their hrefs
             link_match = re.findall('(?<=<a href="#).+?(?=")', eula.html)
-            print 'link matches', link_match
+            #print 'link matches', link_match
 
             # find targets of table of contents hyperlinks
             name_match = re.findall('(?<=<a name=").+?(?=")', eula.html)
             name_match += re.findall('(?<=<a id=").+?(?=")', eula.html)
-            print 'name matches', name_match
+            #print 'name matches', name_match
 
             toc_hyperlinked = False
             matches = 0
@@ -54,7 +54,7 @@ def score(self, eula):
                 if found_match:
                     matches += 1
             
-            print matches, expected_matches
+            #print matches, expected_matches
             if matches > 0.75 * expected_matches:
                 toc_hyperlinked = True