-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatatransfer.py
More file actions
134 lines (104 loc) · 4.82 KB
/
Copy pathdatatransfer.py
File metadata and controls
134 lines (104 loc) · 4.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
## Script to transfer data from the OOI Piweb server to the ODL NAS
import logging
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
from pathlib import Path
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Module-wide logger: the root logger, shared by every function in this script.
logger = logging.getLogger()

# Route log records to a file next to the working directory.
logging.basicConfig(
    level=logging.INFO,  # Drop DEBUG records; keep INFO and anything more severe
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='transfer_log.log',  # Destination log file
    filemode='a',  # Keep earlier runs: append instead of truncating
)
def list_files_in_directory(http_url):
    """List the files and sub-directories linked from one HTTP directory page.

    Only the page at ``http_url`` is fetched and parsed; sub-directories are
    returned as URLs for the caller to visit, they are not traversed here.

    Args:
        http_url: URL of the directory index page. It should end with "/" so
            that relative links resolve inside the directory.

    Returns:
        A ``(file_links, folder_links)`` tuple of lists of absolute URLs.
        Links whose ``href`` ends with "/" are reported as folders, all
        others as files. Only links located strictly below ``http_url`` are
        returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems or timeouts.
    """
    # (connect, read) timeout so a stalled server cannot hang the transfer forever.
    response = requests.get(http_url, timeout=(10, 60))
    response.raise_for_status()  # Raise an error for bad responses

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    file_links = []
    folder_links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Skip anchors without a target, column-sort links ("?C=N;O=D") and
        # in-page fragments: none of them is a file or a directory.
        if not href or href.startswith(('?', '#')):
            continue
        full_url = urljoin(http_url, href)
        # Keep only entries below this directory. This drops links to the
        # parent directory, to the page itself and to other sites, which
        # would otherwise make a recursive caller loop or leave the tree.
        if full_url == http_url or not full_url.startswith(http_url):
            continue
        if href.endswith('/'):  # It's a directory
            folder_links.append(full_url)
        else:  # It's a file
            file_links.append(full_url)
    return file_links, folder_links
def create_nested_directory(nas_base_path, http_url):
    """Map a file URL to its destination path on the NAS and prepare its folder.

    The URL, minus its ``http://`` / ``https://`` scheme marker, is turned
    into a relative path (host name first, then each URL segment) below
    ``nas_base_path``. The parent directory of that path is created when
    missing; the file itself is not created.

    Returns:
        The full destination path as a string.
    """
    # Drop the scheme markers so the path starts at the host name.
    without_scheme = http_url
    for scheme_marker in ("http://", "https://"):
        without_scheme = without_scheme.replace(scheme_marker, "")

    # Convert URL separators into the local path separator.
    local_relative = os.sep.join(without_scheme.split("/"))
    destination = os.path.join(nas_base_path, local_relative)

    # Make sure the folder that will hold the file exists.
    parent_folder = os.path.dirname(destination)
    os.makedirs(parent_folder, exist_ok=True)
    return destination
def get_request_with_retry(url, retries=3, backoff_factor=0.3, timeout=(10, 60)):
    """Issue a streaming GET request with automatic retries and back-off.

    Args:
        url: URL to fetch.
        retries: Total number of retries allowed per request.
        backoff_factor: Back-off factor handed to the retry policy to space
            out successive attempts.
        timeout: Timeout in seconds passed to ``requests`` (a single number
            or a ``(connect, read)`` tuple). Without it a stalled connection
            would block the transfer indefinitely.

    Returns:
        The ``requests.Response``. Because the request is made with
        ``stream=True`` the body is not downloaded until it is read; the
        caller is responsible for closing the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or when
            the retries are exhausted.
    """
    session = requests.Session()
    # Retry on transient server-side errors as well as on connection failures.
    retry = Retry(total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session.get(url, stream=True, timeout=timeout)
def stream_file_to_nas(http_url, nas_path):
    """Stream one file from the HTTP server to ``nas_path`` on the NAS.

    The download is skipped when a file already exists at ``nas_path`` and
    its size equals the Content-Length announced by the server. A response
    with a status other than 200 is logged as an error and nothing is
    written.

    Args:
        http_url: URL of the file to download.
        nas_path: Destination path (string or ``Path``); its parent
            directory must already exist.

    Raises:
        Any exception raised while requesting the file, reading the body or
        writing to ``nas_path`` is propagated to the caller.
    """
    response = get_request_with_retry(http_url)
    # The response is streamed, so it keeps its connection until it is
    # closed. Close it on every path, including the "skip" path where the
    # body is never read.
    try:
        if response.status_code != 200:
            logger.error(f"Failed to download file {http_url}. HTTP Status code: {response.status_code}")
            return

        nas_path = Path(nas_path)
        content_length = response.headers.get('content-length')
        # Skip only when the server announced a size and it matches. A
        # missing header must not be read as "size 0", otherwise an empty
        # file left behind by a failed download would never be retried.
        if (
            content_length is not None
            and nas_path.exists()
            and nas_path.stat().st_size == int(content_length)
        ):
            return  # Skip the file

        # Stream and save the file
        with open(nas_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1048576):  # 1 MiB chunks
                if chunk:  # Filter out keep-alive new chunks
                    f.write(chunk)
        logger.info(f"Streamed file {nas_path.name} to {nas_path.parent.name}")
    finally:
        response.close()
def download_and_stream_files(http_url, nas_base_directory):
    """Recursively mirror an HTTP directory tree onto the NAS.

    The files linked from ``http_url`` are streamed to the NAS first, with a
    progress bar per directory, then every sub-directory is processed the
    same way.

    Args:
        http_url: URL of the directory index to start from.
        nas_base_directory: Directory on the NAS below which the remote
            layout is reproduced.

    Raises:
        requests.RequestException: If the listing of ``http_url`` itself
            cannot be fetched. Failures on individual files or on
            sub-directory listings are logged and skipped instead.
    """
    # Get list of files and folders
    file_links, folder_links = list_files_in_directory(http_url)

    # Loop through each file link (if any)
    if file_links:
        folder_name = file_links[0].split('/')[-2]  # Get the folder name
        with tqdm(total=len(file_links), desc=f"Downloading {folder_name}") as progress_bar:
            for file_url in file_links:
                nas_path = create_nested_directory(nas_base_directory, file_url)  # Create necessary directories
                try:
                    # Stream the file directly to the NAS
                    stream_file_to_nas(file_url, nas_path)
                except Exception as e:
                    # Best effort: record the failure and carry on with the next file.
                    logger.error(f"Failed to stream {file_url} to {nas_path}: {e}")
                progress_bar.update(1)

    # Recursively process each sub-directory
    for folder_url in folder_links:
        # Only descend. An index page may link to its parent directory or to
        # itself; following such a link would recurse without end.
        if folder_url == http_url or not folder_url.startswith(http_url):
            continue
        try:
            download_and_stream_files(folder_url, nas_base_directory)  # Recursive call for subdirectories
        except requests.RequestException as e:
            # One unreachable sub-directory must not abort the rest of the mirror.
            logger.error(f"Failed to list directory {folder_url}: {e}")
# To mount the NAS as local storage with write permissions, run the following commands:
#
# ```bash
# sudo apt-get install cifs-utils
# sudo mkdir -p /media/odl_nas
# sudo mount -t cifs -o credentials=/home/<user>/.smbcredentials,dir_mode=0777,file_mode=0777 //odl.ocean.washington.edu/ODL /media/odl_nas
# ```
#
# Before mounting, create `/home/<user>/.smbcredentials` with this format:
#
# ```bash
# username=your_username
# password=your_password
# ```
if __name__ == "__main__":
    # Default job: mirror the DAS tree from the Piweb server onto the mounted NAS share.
    http_url = "http://piweb.ooirsn.uw.edu/das/"  # Remote directory to start from
    nas_directory = "/media/odl_nas/ODLdata/ooiDAS/"  # Destination root on the NAS mount

    # Walk the remote tree and stream every file to the NAS.
    download_and_stream_files(
        http_url=http_url,
        nas_base_directory=nas_directory,
    )