-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprogram.py
More file actions
98 lines (86 loc) · 3.57 KB
/
program.py
File metadata and controls
98 lines (86 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import xgboost as xgb
from features.URL_stats import features
from features.dynamic_stats import features1, connection, whois_connect, connection_1
import pandas as pd
import tldextract
from colorama import init, Fore, Style
import pickle
# Hostnames of public URL-shortening services.
# A shortener can redirect anywhere, so the whitelist check later in this
# script refuses to trust a URL whose host matches one of these entries.
shorteners_list = [
    "bit.ly", "bitly.kr", "bl.ink", "buff.ly",
    "clicky.me", "cutt.ly", "dub.co", "fox.ly",
    "gg.gg", "han.gl", "is.gd", "kurzelinks.de",
    "kutt.it", "lstu.fr", "oe.cd", "ow.ly",
    "rebrandly.com", "reduced.to", "rip.to", "san.aq",
    "short.io", "shorten-url.com", "shorturl.at", "spoo.me",
    "switchy.io", "tinu.be", "tinyurl.com", "t.ly",
    "urlr.me", "v.gd", "vo.la", "yaso.su",
    "zlnk.com", "sor.bz", "73.nu", "lyn.bz",
    "shlink.io", "yourls.org", "polr.me", "git.io",
    "goo.gl", "me2.do", "cutit.org", "s2r.co",
    "soo.gd", "hoy.kr", "tr.ee",
]
# --- Classifiers ------------------------------------------------------------
# Three XGBoost classifiers are restored from their JSON dumps under models/.
static_model = xgb.XGBClassifier()
static_model.load_model("models/static_model.json")

dynamic_model = xgb.XGBClassifier()
dynamic_model.load_model("models/dynamic_model.json")

model_xgb = xgb.XGBClassifier()
model_xgb.load_model("models/meta_model_1.json")

# The second meta model is stored as a pickle.
# NOTE: unpickling runs arbitrary code from the file, so this path must only
# ever point at a trusted, locally produced artifact.
with open("models/meta_model_lr.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# --- Reference data ---------------------------------------------------------
# Whitelist: the "Domain" column of the first 100000 rows of the Majestic CSV.
whitelist = pd.read_csv(
    "majestic_million.csv",
    usecols=["Domain"],
    nrows=100_000,
)
whitelist_domains = set(whitelist["Domain"])

# Raw text of the spam keyword file, kept as a single string.
with open("data_collection/spam.txt", "r", encoding="utf-8") as keyword_file:
    keywords = keyword_file.read()

# Set of values from the "Root Domain" column of the top-500 domains CSV.
dd = pd.read_csv("features/top500Domains.csv", usecols=["Root Domain"])
popular_domains = set(dd["Root Domain"])
# Interactive entry point: read one URL from stdin, short-circuit if its host
# is whitelisted, otherwise score it with the static, dynamic and meta models.

# colorama only translates ANSI colour codes on legacy Windows consoles once
# init() has been called; on other platforms the call is a no-op.
init()

url = input("Enter URL:")
# NOTE(review): this drops the first "www." found anywhere in the string, not
# only at the start of the host -- confirm that is intended.
url = url.replace("www.", "", 1)

# `score` is opaque here; it is only forwarded to features1() below.
response, score = connection(url)
if response is None:
    print(Fore.RED + "Invalid URL or website does not exist anymore!" + Style.RESET_ALL)
    # SystemExit is a builtin, unlike exit(), which is injected by the `site`
    # module and may be missing. The exit status is unchanged.
    raise SystemExit

# Build the two host spellings that are looked up in the whitelist:
# `domain` is subdomain + domain + suffix, `domain1` omits the suffix.
ext = tldextract.extract(url)
root_domain = (ext.domain + "." + ext.suffix).lower()
if ext.subdomain:
    full_domain = ext.subdomain + "." + ext.domain + "." + ext.suffix
    full_domain1 = ext.subdomain + "." + ext.domain
else:
    full_domain = root_domain
    full_domain1 = ext.domain
domain = full_domain.lower().removeprefix("www.")
domain1 = full_domain1.lower().removeprefix("www.")

# A host counts as a shortener only when it IS a shortener domain or is a
# subdomain of one. A plain substring test would wrongly flag unrelated
# hosts (for example "t.ly" occurs inside "scout.lyft.com").
is_shortener = any(
    domain == shortener or domain.endswith("." + shortener)
    for shortener in shorteners_list
)
# Shorteners are never trusted, even when their domain is on the whitelist.
is_whitelist = (
    domain in whitelist_domains or domain1 in whitelist_domains
) and not is_shortener

if is_whitelist:
    print(Fore.GREEN + "URL is on whitelist: 0% PHISH" + Style.RESET_ALL)
else:
    # static
    static_results = features(url, popular_domains)
    static_data = pd.DataFrame([static_results])
    y_proba = static_model.predict_proba(static_data)[:, 1]
    print(f"STATIC: This site is {y_proba[0] * 100:.2f}% phishing")

    # dynamic
    driver = connection_1()
    try:
        w, available = whois_connect(url)
        try:
            driver.get(url)
        except Exception:
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # propagate instead of being reported as an unreachable host.
            print("No dynamic and meta result: This website does not exist / cannot reach the host")
            raise SystemExit

        dynamic_results = features1(url, response, driver, w, score, keywords, available)
        data = pd.DataFrame([dynamic_results])
        # Non-numeric feature values become NaN rather than raising.
        data = data.apply(pd.to_numeric, errors='coerce')
        y_proba2 = dynamic_model.predict_proba(data)[:, 1]
        print(f"DYNAMIC: This site is {y_proba2[0] * 100:.2f}% phishing")

        # meta model: both meta classifiers take the two base probabilities
        X = pd.DataFrame({
            "static_model": y_proba,
            "dynamic_model": y_proba2
        })
        meta_proba = model.predict_proba(X)[:, 1]
        meta_proba_xgb = model_xgb.predict_proba(X)[:, 1]
        print(f"META MODEL LR: This site is {meta_proba[0] * 100:.2f}% phishing")
        print(f"META MODEL XGB: This site is {meta_proba_xgb[0] * 100:.2f}% phishing")
    finally:
        # Always release the browser, including on the early exit above and
        # on any unexpected error, so no driver process is left running.
        driver.quit()