From a227d715b0a95d12d4fc1a45d78cf50a1a15739d Mon Sep 17 00:00:00 2001 From: kmaster2520 Date: Wed, 14 Feb 2018 14:01:15 -0500 Subject: [PATCH 1/6] basic toc --- api/models/heuristics/easeofnavigation.py | 26 ++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py index 893307e..76fe944 100644 --- a/api/models/heuristics/easeofnavigation.py +++ b/api/models/heuristics/easeofnavigation.py @@ -1,7 +1,31 @@ from models.heuristic import Heuristic +from boilerpipe.extract import Extractor # Formal 3 # Ensure ease of user navigation. class EaseOfNavigation(Heuristic): def score(self, eula): - return {'score': -1, 'max': 4, 'reason': 'Not implemented'} \ No newline at end of file + extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html) + text = extractor.getText() + text1000 = text[:1000] + tocscore = 0 + reason = '' + indexOfTable = -1 + + tocindicators = ['TABLE OF CONTENTS', 'Table Of Contents'] + + for ind in tocindicators: + if indexOfTable < 0: + indexOfTable = text1000.find('TABLE OF CONTENTS') + else: + break + + if indexOfTable < 0: + tocscore = 0 + reason += 'No table of contents found near beginning of EULA. ' + else: + tocscore = 4 + reason += 'Found table of contents. ' + + + return {'score': tocscore, 'max': 4, 'reason': reason, 'text': text1000} \ No newline at end of file From 73eff957e9e39d21c466f7600d6c9ece657c7d7e Mon Sep 17 00:00:00 2001 From: kmaster2520 Date: Wed, 14 Feb 2018 14:03:48 -0500 Subject: [PATCH 2/6] add comment --- api/models/heuristics/easeofnavigation.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py index 76fe944..e308fbb 100644 --- a/api/models/heuristics/easeofnavigation.py +++ b/api/models/heuristics/easeofnavigation.py @@ -5,21 +5,28 @@ # Ensure ease of user navigation. 
class EaseOfNavigation(Heuristic): def score(self, eula): + + # gets actual text since eula.text doesn't get everything extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html) text = extractor.getText() + + # limit text to first 1000 characters text1000 = text[:1000] - tocscore = 0 - reason = '' - indexOfTable = -1 + tocscore = 0 # score for table of contents + reason = '' # reason + indexOfTable = -1 # index of matching text + # text to look out for tocindicators = ['TABLE OF CONTENTS', 'Table Of Contents'] + # find index of indicator text for ind in tocindicators: if indexOfTable < 0: indexOfTable = text1000.find('TABLE OF CONTENTS') else: break + # indexOfTable < 0 --> not found if indexOfTable < 0: tocscore = 0 reason += 'No table of contents found near beginning of EULA. ' From dd1f2a8914d4a90ca3d7145d8af9b546c7cf98da Mon Sep 17 00:00:00 2001 From: Mark Date: Fri, 16 Feb 2018 13:59:01 -0500 Subject: [PATCH 3/6] Changed naming conventions to match project, reduced redundant lines, added processing for non-html eulas, and error handled for html only eulas. 
--- api/models/heuristics/easeofnavigation.py | 30 +++++++++++------------ 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py index e308fbb..5e3b774 100644 --- a/api/models/heuristics/easeofnavigation.py +++ b/api/models/heuristics/easeofnavigation.py @@ -6,33 +6,31 @@ class EaseOfNavigation(Heuristic): def score(self, eula): - # gets actual text since eula.text doesn't get everything - extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html) - text = extractor.getText() + if eula.html is not None: + # gets actual text since eula.text doesn't get everything + extractor = Extractor(extractor='KeepEverythingExtractor', html= eula.html) + text = extractor.getText() + else: + text = eula.text # limit text to first 1000 characters - text1000 = text[:1000] + text_first_1000 = text[:1000] tocscore = 0 # score for table of contents reason = '' # reason - indexOfTable = -1 # index of matching text + index_of_table = -1 # index of matching text # text to look out for tocindicators = ['TABLE OF CONTENTS', 'Table Of Contents'] # find index of indicator text for ind in tocindicators: - if indexOfTable < 0: - indexOfTable = text1000.find('TABLE OF CONTENTS') + if index_of_table < 0: + index_of_table = text_first_1000.find('TABLE OF CONTENTS') else: break - - # indexOfTable < 0 --> not found - if indexOfTable < 0: - tocscore = 0 - reason += 'No table of contents found near beginning of EULA. ' - else: - tocscore = 4 - reason += 'Found table of contents. 
' + # Score of 4 if found, score of 0 if not + # TODO: additional scoring hyperlinked vs unlinked table + tocscore = 4 if index_of_table >= 0 else 0 - return {'score': tocscore, 'max': 4, 'reason': reason, 'text': text1000} \ No newline at end of file + return {'score': tocscore, 'max': 4, 'index': index_of_table} \ No newline at end of file From d15e630e231766654f801eff45c2641ba4bd8b78 Mon Sep 17 00:00:00 2001 From: kmaster2520 Date: Mon, 19 Feb 2018 12:04:20 -0500 Subject: [PATCH 4/6] fixed loop to check indicator rather than same phrase over and over again --- api/models/heuristics/easeofnavigation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py index 5e3b774..9b26ba3 100644 --- a/api/models/heuristics/easeofnavigation.py +++ b/api/models/heuristics/easeofnavigation.py @@ -25,7 +25,7 @@ def score(self, eula): # find index of indicator text for ind in tocindicators: if index_of_table < 0: - index_of_table = text_first_1000.find('TABLE OF CONTENTS') + index_of_table = text_first_1000.find(ind) else: break From 6b61991cb5d608c5cf2956a4046b8a972222a1e0 Mon Sep 17 00:00:00 2001 From: kmaster2520 Date: Fri, 6 Apr 2018 11:59:07 -0400 Subject: [PATCH 5/6] use regex to find hyperlink --- api/models/heuristics/easeofnavigation.py | 38 +++++++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py index 9b26ba3..8f70c38 100644 --- a/api/models/heuristics/easeofnavigation.py +++ b/api/models/heuristics/easeofnavigation.py @@ -1,6 +1,8 @@ from models.heuristic import Heuristic from boilerpipe.extract import Extractor +import re + # Formal 3 # Ensure ease of user navigation. 
class EaseOfNavigation(Heuristic): @@ -29,8 +31,38 @@ def score(self, eula): else: break - # Score of 4 if found, score of 0 if not - # TODO: additional scoring hyperlinked vs unlinked table - tocscore = 4 if index_of_table >= 0 else 0 + if eula.html: + + # find all table of contents entries and their hrefs + link_match = re.findall('(?<= 0.75 * expected_matches: + toc_hyperlinked = True + + + # Score of 4 if found, score of 0 if not + # TODO: additional scoring hyperlinked vs unlinked table + tocscore = 4 if toc_hyperlinked else 2 + else: + tocscore = 4 if index_of_table >= 0 else 0 return {'score': tocscore, 'max': 4, 'index': index_of_table} \ No newline at end of file From 198e963c9606a2508cecb9043526226e8adc2663 Mon Sep 17 00:00:00 2001 From: kmaster2520 Date: Fri, 6 Apr 2018 12:00:45 -0400 Subject: [PATCH 6/6] remove print statements --- api/models/heuristics/easeofnavigation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/models/heuristics/easeofnavigation.py b/api/models/heuristics/easeofnavigation.py index 8f70c38..8763bc0 100644 --- a/api/models/heuristics/easeofnavigation.py +++ b/api/models/heuristics/easeofnavigation.py @@ -35,12 +35,12 @@ def score(self, eula): # find all table of contents entries and their hrefs link_match = re.findall('(?<= 0.75 * expected_matches: toc_hyperlinked = True