-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFile_Download.py
More file actions
86 lines (70 loc) · 3.44 KB
/
File_Download.py
File metadata and controls
86 lines (70 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import subprocess
import os
import time
from Variables import total_download_try, successful_download, unsuccessful_download
from Metadata_Extraction import process_folder
# Extensions that download_file() accepts as-is; a file name whose extension
# is not listed here gets ".zip" appended before it is saved.
VALID_EXTENSIONS = (
    "xls xlsx pdf sql doc docx pptx zip db backup "
    "apk txt csv json xml html css js epub mobi "
    "azw3 obj stl fbx dae 3ds ply avi mov mkv "
    "psd ai dwg dxf sketch cache secret config "
    "md5 ini img"
).split()
def _print_download_status():
    """Print the running download counters held in this module's globals."""
    # total_valid_snapshots is only created by download_files(); fall back to
    # a placeholder so a direct download_file() call does not raise NameError.
    total_to_try = globals().get("total_valid_snapshots", "unknown")
    print(f"Successful: {successful_download} Unsuccessful: {unsuccessful_download} Total Downloads Tried: {total_download_try} Total Downloads To Try: {total_to_try}")


def download_file(url, download_folder, retries=3, timeout=30):
    """Download *url* into *download_folder* with curl, retrying on failure.

    The file name is the last "/"-separated segment of the URL. If that name
    has no extension, or its extension (compared case-insensitively) is not in
    VALID_EXTENSIONS, ".zip" is appended to it.

    Args:
        url: Address of the file to fetch.
        download_folder: Directory to save into; created if it is missing.
        retries: Maximum number of curl invocations before giving up.
        timeout: Value passed to curl's --max-time, in seconds.

    Returns:
        True as soon as one curl run exits successfully, False once every
        attempt has failed.

    Side effects:
        Updates the module-level counters successful_download,
        unsuccessful_download and total_download_try, and prints progress.
        These names were imported with ``from Variables import ...``, so
        rebinding them here changes this module's copies only.
    """
    global unsuccessful_download
    global successful_download
    global total_download_try

    file_name = url.split("/")[-1]
    file_extension = file_name.split('.')[-1] if '.' in file_name else None
    # Lower-case before comparing so that e.g. "REPORT.PDF" is recognised and
    # not mislabelled as "REPORT.PDF.zip".
    if not file_extension or file_extension.lower() not in VALID_EXTENSIONS:
        file_name = f"{file_name}.zip"
    file_path = os.path.join(download_folder, file_name)
    os.makedirs(download_folder, exist_ok=True)

    curl_command = [
        "curl", "-L",
        # Without --fail curl exits 0 on HTTP 4xx/5xx and saves the error
        # page, which would be counted as a successful download.
        "--fail",
        "-o", file_path,
        "--max-time", str(timeout),
        # Pass the URL as an option value so an entry starting with "-" is
        # never parsed as a curl flag.
        "--url", url,
    ]

    for attempt in range(1, retries + 1):
        try:
            subprocess.run(curl_command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Attempt: {attempt} Failed to download: {file_name} from: {url}. Error: {e}")
            _print_download_status()
        except Exception as e:
            # Best effort: anything else (e.g. curl not installed) is reported
            # and treated like a failed attempt rather than aborting the run.
            print(f"Attempt: {attempt} Error while downloading: {file_name} from: {url}: {e}")
            _print_download_status()
        else:
            print(f"Downloaded: {file_name}")
            successful_download += 1
            total_download_try += 1
            _print_download_status()
            return True
        # Pause between attempts only; no point waiting after the last one.
        if attempt < retries:
            time.sleep(5)

    unsuccessful_download += 1
    total_download_try += 1
    print(f"Failed to download: {file_name} after: {retries} attempts.")
    _print_download_status()
    return False
def download_files(successful_snapshot):
    """Download every URL listed in valid_urls.txt, then offer metadata extraction.

    Each non-blank line of "valid_urls.txt" (read from the current working
    directory) is passed to download_file() with "downloaded_files" as the
    target folder. After a summary is printed, the user is asked whether to
    run process_folder() on that folder.

    Args:
        successful_snapshot: Stored in the module-level global
            total_valid_snapshots, which download_file() prints as
            "Total Downloads To Try".
    """
    global total_valid_snapshots
    total_valid_snapshots = successful_snapshot

    with open("valid_urls.txt", "r") as valid_file:
        # Skip blank lines: an empty URL can never succeed and would only
        # burn a full retry cycle and inflate the failure count.
        valid_urls = [line.strip() for line in valid_file if line.strip()]

    download_folder = "downloaded_files"
    # Create the folder up front so it exists even when the URL list is empty.
    os.makedirs(download_folder, exist_ok=True)

    successful_downloads = 0
    failed_downloads = 0
    for url in valid_urls:
        if download_file(url, download_folder):
            successful_downloads += 1
        else:
            failed_downloads += 1

    print(f"Finished downloading files. {successful_downloads} files downloaded.")
    print(f"{failed_downloads} downloads failed.")

    # Strip surrounding whitespace so an answer such as "y " still counts as yes.
    answer = input("Want to start metadata extraction? [Y/N]").strip().lower()
    if answer == "y":
        print("Starting Metadata Extraction...")
        process_folder(download_folder)
    else:
        print("Exit...")