diff --git a/docs/sphinx-docs/source/user/menu_bar.rst b/docs/sphinx-docs/source/user/menu_bar.rst index 0ab0c88d84..a969e2b2f4 100644 --- a/docs/sphinx-docs/source/user/menu_bar.rst +++ b/docs/sphinx-docs/source/user/menu_bar.rst @@ -11,7 +11,8 @@ The File option allows you load data into *SasView* for analysis, or to save the Data can be loaded one file at a time, or by selecting multiple files, or by loading an entire folder of files (in which case *SasView* will attempt to make an intelligent guess as to what to load based on the file formats it recognises in the folder!). Data can also be loaded by dragging and dropping files directly -onto Data Explorer. +onto Data Explorer. Additionally, datasets can be downloaded directly from the SASBDB (Small Angle Scattering +Biological Data Bank) using **File > Load from SASBDB...** (see :ref:`SASBDB_Download` for details). A *SasView* session can also be saved and reloaded as an 'Analysis' (an individual model fit or invariant calculation, etc), or as a 'Project' (everything you have done since starting your *SasView* session). 
def actionLoad_SASBDB(self):
    """
    Menu File/Load from SASBDB

    Run the SASBDB download dialog modally. When the dialog is accepted and
    the downloaded file exists on disk, the file is handed to
    ``self.filesWidget.readData`` and the metadata reported by the dialog is
    copied onto the loaded data objects. A failure while loading is logged
    and shown to the user in a warning box instead of being raised.
    """
    from sas.qtgui.Utilities.SASBDB.SASBDBDownloadDialog import SASBDBDownloadDialog

    download_dialog = SASBDBDownloadDialog(parent=self._workspace)
    if not download_dialog.exec():
        # Dialog was cancelled/closed: nothing to load
        return

    filepath = download_dialog.getDownloadedFilepath()
    dataset_info = download_dialog.getDatasetInfo()
    if not filepath or not os.path.exists(filepath):
        return

    try:
        # readData returns (output_dict, message); the message is not used here
        loaded_data, _message = self.filesWidget.readData([filepath])

        # Metadata can only be attached when both sides are available
        if dataset_info and loaded_data:
            self._populateSASBDBMetadata(loaded_data, dataset_info)

        if not dataset_info:
            logger.info(f"Successfully loaded SASBDB dataset from {filepath}")
        else:
            entry_id = dataset_info.code or dataset_info.entry_id
            logger.info(f"Successfully loaded SASBDB dataset {entry_id} from {filepath}")
            if dataset_info.title:
                logger.info(f" Title: {dataset_info.title}")
            if dataset_info.rg is not None:
                logger.info(f" Rg: {dataset_info.rg:.2f} Å")
            if dataset_info.molecular_weight is not None:
                logger.info(f" MW: {dataset_info.molecular_weight:.1f} kDa")

    except Exception as e:
        logger.error(f"Error loading downloaded SASBDB dataset: {e}", exc_info=True)
        QMessageBox.warning(
            self._workspace,
            "Load Error",
            f"Failed to load downloaded dataset:\n{str(e)}"
        )
def _populateSASBDBMetadata(self, loaded_data: dict, dataset_info):
    """
    Copy SASBDB metadata onto freshly loaded data objects.

    For every entry of ``loaded_data`` this fills in the title and
    instrument (only when the data object has none), appends the SASBDB
    code to ``run``, sets the sample ID and temperature, extends
    ``sample.details`` with human-readable lines, sets the source
    wavelength and stores SASBDB-specific values under ``SASBDB_*`` keys
    of ``meta_data``. A failure on one data object is logged as a warning
    and does not stop the remaining objects from being processed.

    :param loaded_data: Dictionary of loaded data objects (keyed by id)
    :param dataset_info: SASBDBDatasetInfo object with parsed metadata
    """
    from sasdata.dataloader.data_info import Sample, Source

    info = dataset_info
    for data_id, data in loaded_data.items():
        try:
            logger.debug(f"Populating metadata for {data_id}: molecule_name={info.molecule_name}, "
                         f"sample_name={info.sample_name}, temperature={info.temperature}, "
                         f"concentration={info.concentration}, buffer={info.buffer_description}")

            # Title and instrument: never overwrite what the file itself provided
            if info.title and not data.title:
                data.title = info.title
            if info.instrument and not data.instrument:
                data.instrument = info.instrument

            # Record the SASBDB code as a run identifier (once)
            if info.code:
                if not data.run:
                    data.run = []
                if info.code not in data.run:
                    data.run.append(info.code)

            # --- Sample -------------------------------------------------
            if data.sample is None:
                data.sample = Sample()
            if not hasattr(data.sample, 'details') or data.sample.details is None:
                data.sample.details = []

            # Sample ID: molecule short name, else sample name, else SASBDB code
            sample_id = info.molecule_short_name or info.sample_name or info.code or ""
            if sample_id:
                data.sample.ID = sample_id
                logger.debug(f"Set sample.ID to: {sample_id}")

            sample_details = []
            if info.molecule_name:
                molecule_str = f"Molecule: {info.molecule_name}"
                if info.molecule_type:
                    molecule_str += f" ({info.molecule_type})"
                sample_details.append(molecule_str)
            elif info.sample_name:
                sample_details.append(f"Sample: {info.sample_name}")

            # Simple "Label: value" lines, emitted only for non-empty values
            labelled_values = (
                ("Description", info.sample_description),
                ("Sequence", info.sequence),
                ("UniProt", info.uniprot_code),
                ("Oligomerization", info.oligomerization or info.oligomeric_state),
                ("Number of molecules", info.number_of_molecules),
                ("Source organism", info.source_organism),
            )
            for label, value in labelled_values:
                if value:
                    sample_details.append(f"{label}: {value}")

            if info.temperature is not None:
                data.sample.temperature = info.temperature
                if info.temperature_unit:
                    data.sample.temperature_unit = info.temperature_unit
                logger.debug(f"Set sample.temperature to: {info.temperature} {info.temperature_unit}")
                temp_str = f"Temperature: {info.temperature}"
                if info.temperature_unit:
                    temp_str += f" {info.temperature_unit}"
                sample_details.append(temp_str)

            if info.concentration is not None:
                conc_str = f"Concentration: {info.concentration}"
                if info.concentration_unit:
                    conc_str += f" {info.concentration_unit}"
                sample_details.append(conc_str)

            if info.buffer_description:
                buffer_str = f"Buffer: {info.buffer_description}"
                if info.ph is not None:
                    buffer_str += f" (pH {info.ph})"
                sample_details.append(buffer_str)

            # Append without duplicating lines that are already present
            for detail in sample_details:
                if detail and detail not in data.sample.details:
                    data.sample.details.append(detail)
                    logger.debug(f"Added to sample.details: {detail}")

            logger.debug(f"Sample populated for {data_id}: name={getattr(data.sample, 'name', None)}, "
                         f"temperature={getattr(data.sample, 'temperature', None)}, "
                         f"details={getattr(data.sample, 'details', [])}")

            # --- Source -------------------------------------------------
            if data.source is None:
                data.source = Source()

            if info.wavelength is not None:
                data.source.wavelength = info.wavelength
                # Only replace the unit when SASBDB actually supplies one, so
                # an existing unit is not clobbered with None (same rule as
                # the temperature unit above).
                if info.wavelength_unit:
                    data.source.wavelength_unit = info.wavelength_unit

            # --- SASBDB-specific values in meta_data --------------------
            if not hasattr(data, 'meta_data') or data.meta_data is None:
                data.meta_data = {}
            meta = data.meta_data

            meta['SASBDB_code'] = info.code or info.entry_id

            if info.rg is not None:
                meta['SASBDB_Rg'] = info.rg
                if info.rg_error is not None:
                    meta['SASBDB_Rg_error'] = info.rg_error

            if info.i0 is not None:
                meta['SASBDB_I0'] = info.i0
                if info.i0_error is not None:
                    meta['SASBDB_I0_error'] = info.i0_error

            if info.dmax is not None:
                meta['SASBDB_Dmax'] = info.dmax

            if info.molecular_weight is not None:
                meta['SASBDB_MW'] = info.molecular_weight
                if info.molecular_weight_method:
                    meta['SASBDB_MW_method'] = info.molecular_weight_method

            if info.porod_volume is not None:
                meta['SASBDB_Porod_volume'] = info.porod_volume

            optional_entries = (
                ('SASBDB_molecule', info.molecule_name),
                ('SASBDB_molecule_type', info.molecule_type),
                ('SASBDB_oligomeric_state', info.oligomeric_state),
                ('SASBDB_DOI', info.publication_doi),
                ('SASBDB_PMID', info.publication_pmid),
            )
            for key, value in optional_entries:
                if value:
                    meta[key] = value

            if info.authors:
                meta['SASBDB_authors'] = ', '.join(info.authors)

            logger.debug(f"Populated SASBDB metadata for data: {data_id}")

        except Exception as e:
            # Best effort: metadata is optional, the data itself is already loaded
            logger.warning(f"Error populating metadata for data {data_id}: {e}")
def _formatDictValue(value, max_depth=2) -> str:
    """
    Format a dictionary value for display, extracting useful information.

    Well-known keys (``name``, ``beamline_name``, ``type``, ``city``,
    ``country``, ``resolution``) are rendered as a ``" | "``-separated
    summary. Dictionaries without any of those keys and with at most three
    entries are rendered as ``"key: value"`` pairs, nested dictionaries
    being formatted recursively until ``max_depth`` is exhausted. Anything
    else collapses to ``"{...}"``.

    :param value: Dictionary or other value to format
    :param max_depth: Maximum nesting depth to process
    :return: Formatted string representation
    """
    if not isinstance(value, dict):
        return str(value)

    if max_depth <= 0:
        return "{...}"

    # Every part is stored as a string so the final join cannot fail on
    # numeric or None values.
    parts = []

    # Instrument/source dictionaries
    if 'name' in value:
        parts.append(str(value['name']))
    if 'beamline_name' in value:
        parts.append(f"Beamline: {value['beamline_name']}")
    if 'type' in value:
        parts.append(f"Type: {value['type']}")
    if 'city' in value and 'country' in value:
        parts.append(f"{value['city']}, {value['country']}")
    elif 'city' in value:
        parts.append(str(value['city']))
    elif 'country' in value:
        parts.append(str(value['country']))

    # Detector dictionaries
    # NOTE(review): any dict with a 'type' key is treated as a detector here,
    # so an instrument dict with 'name' and 'type' lists its name twice -
    # confirm whether that is intended.
    if 'detector' in str(value).lower() or 'type' in value:
        if 'name' in value:
            parts.append(f"Detector: {value['name']}")
        if 'type' in value and 'name' not in value:
            parts.append(f"Detector: {value['type']}")
    if 'resolution' in value:
        parts.append(f"Resolution: {value['resolution']}")

    if parts:
        return " | ".join(parts)

    # Fallback: show key-value pairs for shallow dicts
    if len(value) <= 3:
        pairs = []
        for key, item in value.items():
            if isinstance(item, dict):
                nested = _formatDictValue(item, max_depth - 1)
                # Keep the "{...}" placeholder as is, wrap real content in braces
                rendered = nested if nested == "{...}" else f"{{{nested}}}"
            else:
                rendered = str(item)
            pairs.append(f"{key}: {rendered}")
        return ", ".join(pairs)

    return "{...}"
def _formatDictLine(line: str) -> str:
    """
    Extract and format dictionary information from a line.

    A line of the form ``Label: {...}`` or a bare ``{...}`` whose payload is
    a Python dict literal is turned into a compact ``" | "``-separated
    summary. Lines that do not contain such a literal, or whose literal
    cannot be parsed, are returned unchanged.

    :param line: Line that may contain a dictionary representation
    :return: Formatted line with dictionary info extracted, or original line if no dict found
    """
    import ast
    import re

    stripped = line.strip()

    # Cheap pre-filter: a dict repr needs a brace, a quote and a colon
    if not ("{" in stripped and "'" in stripped and ":" in stripped):
        return line

    # Split an optional leading label (e.g. "Instrument:") from the literal
    label_match = re.match(r'^(\w+)\s*:\s*(.+)', stripped)
    if label_match:
        label, dict_str = label_match.groups()
    elif stripped.startswith("{"):
        label, dict_str = None, stripped
    else:
        return line

    try:
        # literal_eval only evaluates literals, never arbitrary code
        parsed = ast.literal_eval(dict_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input; formatting is cosmetic, so fall back to the original line
        return line

    if not isinstance(parsed, dict):
        return line

    def labelled(body: str) -> str:
        return f"{label}: {body}" if label else body

    # Parts are always strings so the join below cannot raise
    parts = []

    # Instrument/source info
    if parsed.get('name'):
        parts.append(str(parsed['name']))
    if parsed.get('beamline_name'):
        parts.append(f"Beamline: {parsed['beamline_name']}")
    if parsed.get('type_of_source'):
        parts.append(f"Source: {parsed['type_of_source']}")

    city = parsed.get('city')
    country = parsed.get('country')
    if city:
        parts.append(f"{city}, {country}" if country else str(city))
    elif country:
        parts.append(str(country))

    # Detector info (nested dict or direct value)
    detector = parsed.get('detector')
    detector_info = None
    if isinstance(detector, dict):
        detector_info = detector.get('name') or detector.get('type')
    elif detector is not None:
        detector_info = str(detector)
    if detector_info:
        parts.append(f"Detector: {detector_info}")

    if parts:
        return labelled(" | ".join(parts))

    # Fallback: show key-value pairs for simple dicts
    simple_parts = [
        f"{key}: {item}"
        for key, item in parsed.items()
        if item is not None and not isinstance(item, dict)
    ]
    if simple_parts and len(simple_parts) <= 5:
        return labelled(" | ".join(simple_parts))

    return line
def _formatSASBDBMetadata(data) -> str:
    """
    Format SASBDB metadata from data object for display.

    Produces a framed "SASBDB Metadata" section with one line each for the
    entry header (code, instrument, location, detector), sample, source
    wavelength, structural parameters and publication info. Lines are only
    emitted for values that are present.

    :param data: Data1D or Data2D object
    :return: Formatted string with SASBDB metadata, or empty string if none
    """
    meta = getattr(data, 'meta_data', None) or {}

    # Only data that was tagged with a SASBDB code gets the extra section
    if 'SASBDB_code' not in meta:
        return ""

    def location_of(info: dict):
        """Return 'city, country', or whichever of the two is present, or None."""
        city, country = info.get('city'), info.get('country')
        if city and country:
            return f"{city}, {country}"
        found = city or country
        return str(found) if found else None

    rule = "=" * 50
    lines = ["", rule, "SASBDB Metadata", rule]

    # ---- Entry and instrument info (compact header) ----------------------
    header_parts = []
    if meta.get('SASBDB_code'):
        header_parts.append(f"Code: {meta['SASBDB_code']}")

    instrument_str = None
    location_str = None

    # data.instrument may be a plain string or a dict
    instrument = getattr(data, 'instrument', None)
    if instrument:
        if isinstance(instrument, str):
            instrument_str = instrument
        elif isinstance(instrument, dict):
            instrument_str = instrument.get('name') or instrument.get('beamline_name')
            location_str = location_of(instrument)

    # Fall back to instrument-like entries stored in meta_data
    if not instrument_str:
        for key, val in meta.items():
            if isinstance(val, dict):
                if 'beamline_name' in val or 'name' in val or 'type_of_source' in val:
                    instrument_str = val.get('beamline_name') or val.get('name')
                    location_str = location_of(val) or location_str
                    break
            elif key in ['instrument', 'source', 'beamline'] and isinstance(val, str):
                instrument_str = val
                break

    if instrument_str:
        header_parts.append(f"Instrument: {instrument_str}")
    if location_str:
        header_parts.append(location_str)

    # Detector info: first dict with a usable name/type, or a string entry
    # stored under a key that mentions "detector"
    detector_info = None
    for key, val in meta.items():
        if isinstance(val, dict) and ('detector' in key.lower() or 'type' in val):
            detector_name = val.get('name') or val.get('type')
            if detector_name:
                detector_info = detector_name
                break
        elif 'detector' in key.lower() and isinstance(val, str):
            detector_info = val
            break

    if detector_info:
        header_parts.append(f"Detector: {detector_info}")

    if header_parts:
        lines.append(" | ".join(header_parts))

    # ---- Sample info (consolidated) --------------------------------------
    sample = getattr(data, 'sample', None)
    if sample:
        sample_parts = []

        sample_name = getattr(sample, 'name', None)
        if sample_name:
            sample_parts.append(str(sample_name))

        temperature = getattr(sample, 'temperature', None)
        if temperature is not None:
            temp_unit = getattr(sample, 'temperature_unit', 'K') or 'K'
            sample_parts.append(f"T = {temperature} {temp_unit}")

        # Concentration and buffer are stored as labelled lines in
        # sample.details; strip only the leading label, not every occurrence
        for detail in getattr(sample, 'details', None) or []:
            if not isinstance(detail, str):
                continue
            for prefix in ("Concentration:", "Buffer:"):
                if detail.startswith(prefix):
                    sample_parts.append(detail[len(prefix):].strip())
                    break

        if sample_parts:
            lines.append(f"Sample: {' | '.join(sample_parts)}")

    # ---- Source wavelength -----------------------------------------------
    source = getattr(data, 'source', None)
    if source:
        wavelength = getattr(source, 'wavelength', None)
        if wavelength is not None:
            wl_unit = getattr(source, 'wavelength_unit', 'Å') or 'Å'
            lines.append(f"Wavelength: {wavelength} {wl_unit}")

    # ---- Structural parameters (compact format) --------------------------
    structural_parts = []
    if meta.get('SASBDB_Rg') is not None:
        rg_str = f"Rg = {meta['SASBDB_Rg']:.2f}"
        if meta.get('SASBDB_Rg_error') is not None:
            rg_str += f" ± {meta['SASBDB_Rg_error']:.2f}"
        rg_str += " Å"
        structural_parts.append(rg_str)

    if meta.get('SASBDB_I0') is not None:
        i0_str = f"I(0) = {meta['SASBDB_I0']:.4e}"
        if meta.get('SASBDB_I0_error') is not None:
            i0_str += f" ± {meta['SASBDB_I0_error']:.4e}"
        structural_parts.append(i0_str)

    if meta.get('SASBDB_Dmax') is not None:
        structural_parts.append(f"Dmax = {meta['SASBDB_Dmax']:.2f} Å")

    if meta.get('SASBDB_MW') is not None:
        mw_str = f"MW = {meta['SASBDB_MW']:.2f} kDa"
        if meta.get('SASBDB_MW_method'):
            mw_str += f" ({meta['SASBDB_MW_method']})"
        structural_parts.append(mw_str)

    if meta.get('SASBDB_Porod_volume') is not None:
        structural_parts.append(f"Porod Volume = {meta['SASBDB_Porod_volume']:.0f} ų")

    if structural_parts:
        lines.append("Structural: " + " | ".join(structural_parts))

    # ---- Publication info (compact, only if available) -------------------
    pub_parts = []
    if meta.get('SASBDB_authors'):
        authors = meta['SASBDB_authors']
        # Keep the line short: cut long author lists to 50 characters
        if len(authors) > 50:
            authors = authors[:47] + "..."
        pub_parts.append(f"Authors: {authors}")
    if meta.get('SASBDB_DOI'):
        pub_parts.append(f"DOI: {meta['SASBDB_DOI']}")
    if meta.get('SASBDB_PMID'):
        pub_parts.append(f"PMID: {meta['SASBDB_PMID']}")

    if pub_parts:
        lines.append("Publication: " + " | ".join(pub_parts))

    # Closing rule followed by one blank line
    lines.extend([rule, "", ""])
    return "\n".join(lines)
class SASBDBDownloadDialog(QtWidgets.QDialog, Ui_SASBDBDownloadDialogUI):
    """
    Dialog for downloading datasets from SASBDB.

    Lets the user enter a SASBDB dataset identifier and downloads the
    dataset into the system temporary directory. On success the dialog is
    accepted; the caller then retrieves the file path and parsed metadata
    through :meth:`getDownloadedFilepath` and :meth:`getDatasetInfo`.
    """

    def __init__(self, parent=None):
        """
        Initialize the download dialog.

        :param parent: Parent widget
        """
        super().__init__(parent)
        self.setupUi(self)

        # Results of the last download; both stay None until one succeeds
        self.downloaded_filepath: str | None = None
        self.dataset_info: SASBDBDatasetInfo | None = None

        # Connect signals
        self.cmdDownload.clicked.connect(self.onDownload)
        self.cmdCancel.clicked.connect(self.reject)
        self.cmdHelp.clicked.connect(self.onHelp)
        self.txtDatasetId.returnPressed.connect(self.onDownload)

        # Set focus on dataset ID input
        self.txtDatasetId.setFocus()

    def _setBusy(self, busy: bool):
        """
        Switch the dialog between its idle and downloading appearance.

        :param busy: True while a download is running
        """
        self.cmdDownload.setEnabled(not busy)
        self.cmdCancel.setEnabled(not busy)
        self.progressBar.setVisible(busy)
        if busy:
            self.progressBar.setRange(0, 0)  # Indeterminate progress

    def onDownload(self):
        """
        Handle download button click.

        Validates that an identifier was entered, downloads the dataset and
        accepts the dialog when the downloaded file exists. On failure an
        error is shown and the dialog stays open for another attempt.
        """
        dataset_id = self.txtDatasetId.text().strip()

        if not dataset_id:
            self._showError("Please enter a dataset identifier.")
            return

        self._setBusy(True)
        self.lblStatus.setText("Downloading dataset...")
        # The download below runs on the GUI thread; repaint first so the
        # busy state is visible while it blocks
        QtWidgets.QApplication.processEvents()

        try:
            output_dir = tempfile.gettempdir()
            filepath, dataset_info = downloadDataset(dataset_id, output_dir)

            # Store the metadata
            self.dataset_info = dataset_info

            if not (filepath and os.path.exists(filepath)):
                self._setBusy(False)
                self._showError(f"Failed to download dataset {dataset_id}.\n"
                                "Please check the dataset identifier and try again.")
                return

            self.downloaded_filepath = filepath

            # Build success message with metadata summary
            success_msg = f"Successfully downloaded dataset {dataset_id}"
            if dataset_info:
                details = self._formatMetadataSummary(dataset_info)
                if details:
                    success_msg += f"\n{details}"

            self.lblStatus.setText(success_msg)
            self.progressBar.setVisible(False)

            if dataset_info:
                self._logMetadata(dataset_info)

            self.accept()  # Close dialog and return success

        except Exception as e:
            logger.error(f"Error downloading dataset {dataset_id}: {e}", exc_info=True)
            self._setBusy(False)
            self._showError(f"Error downloading dataset:\n{str(e)}")

    @staticmethod
    def _withUnit(value, unit) -> str:
        """Return ``value`` followed by ``unit``, omitting a missing unit."""
        return f"{value} {unit}" if unit else f"{value}"

    def _formatMetadataSummary(self, info: SASBDBDatasetInfo) -> str:
        """
        Format a brief summary of the dataset metadata.

        :param info: Parsed dataset info
        :return: Formatted summary string, one item per line
        """
        parts = []

        if info.title:
            parts.append(f"Title: {info.title}")
        if info.sample_name:
            parts.append(f"Sample: {info.sample_name}")
        if info.rg is not None:
            rg_str = f"Rg: {info.rg:.2f}"
            # An error of exactly 0 is still a reported value
            if info.rg_error is not None:
                rg_str += f" ± {info.rg_error:.2f}"
            rg_str += " Å"
            parts.append(rg_str)
        if info.molecular_weight is not None:
            parts.append(f"MW: {info.molecular_weight:.1f} kDa")

        return "\n".join(parts)

    def _logMetadata(self, info: SASBDBDatasetInfo):
        """
        Log detailed metadata information.

        Numeric fields are tested against None rather than for truthiness so
        that legitimate zero values (e.g. a temperature of 0) are logged.

        :param info: Parsed dataset info
        """
        logger.info(f"SASBDB Dataset: {info.code or info.entry_id}")
        if info.title:
            logger.info(f" Title: {info.title}")
        if info.sample_name:
            logger.info(f" Sample: {info.sample_name}")
        if info.molecule_name:
            logger.info(f" Molecule: {info.molecule_name}")
        if info.concentration is not None:
            logger.info(f" Concentration: {self._withUnit(info.concentration, info.concentration_unit)}")
        if info.buffer_description:
            logger.info(f" Buffer: {info.buffer_description}")
        if info.instrument:
            logger.info(f" Instrument: {info.instrument}")
        if info.wavelength is not None:
            logger.info(f" Wavelength: {self._withUnit(info.wavelength, info.wavelength_unit)}")
        if info.temperature is not None:
            logger.info(f" Temperature: {self._withUnit(info.temperature, info.temperature_unit)}")
        if info.rg is not None:
            logger.info(f" Rg: {info.rg} ± {info.rg_error or 0} Å")
        if info.i0 is not None:
            logger.info(f" I(0): {info.i0} ± {info.i0_error or 0}")
        if info.dmax is not None:
            logger.info(f" Dmax: {info.dmax} Å")
        if info.molecular_weight is not None:
            logger.info(f" MW: {info.molecular_weight} kDa")
        if info.publication_doi:
            logger.info(f" DOI: {info.publication_doi}")

    def _showError(self, message: str):
        """
        Display an error message to the user.

        The message is written to the status label and shown in a warning box.

        :param message: Error message to display
        """
        self.lblStatus.setText(f"{message}")
        QtWidgets.QMessageBox.warning(self, "Download Error", message)

    def getDownloadedFilepath(self) -> str | None:
        """
        Get the path to the downloaded file.

        :return: Path to downloaded file, or None if download failed
        """
        return self.downloaded_filepath

    def getDatasetInfo(self) -> SASBDBDatasetInfo | None:
        """
        Get the parsed dataset metadata.

        :return: SASBDBDatasetInfo object, or None if metadata not available
        """
        return self.dataset_info

    def onHelp(self):
        """
        Show the SASBDB download help documentation.

        The parent's help facility is preferred (``parent.guiManager.showHelp``
        or ``parent.showHelp``); if the parent offers none, or it fails,
        ``GuiUtils.showHelp`` is used instead. Failures are logged, never raised.
        """
        from sas.qtgui.Utilities import GuiUtils
        help_location = "user/qtgui/Utilities/SASBDB/sasbdb_download_help.html"

        try:
            parent = self.parent()
            if parent is not None:
                if hasattr(parent, 'guiManager') and hasattr(parent.guiManager, 'showHelp'):
                    parent.guiManager.showHelp(help_location)
                    return
                if hasattr(parent, 'showHelp'):
                    parent.showHelp(help_location)
                    return
        except Exception as e:
            logger.warning(f"Could not display help: {e}")

        # Fallback to GuiUtils
        try:
            GuiUtils.showHelp(help_location)
        except Exception as e:
            logger.error(f"Failed to display help: {e}")
+""" + diff --git a/src/sas/qtgui/Utilities/SASBDB/__init__.py b/src/sas/qtgui/Utilities/SASBDB/__init__.py new file mode 100644 index 0000000000..5ce0e3c888 --- /dev/null +++ b/src/sas/qtgui/Utilities/SASBDB/__init__.py @@ -0,0 +1,7 @@ +""" +SASBDB (Small Angle Scattering Biological Data Bank) utilities. + +This package provides functionality for interacting with SASBDB, +including downloading datasets and exporting data to SASBDB format. +""" + diff --git a/src/sas/qtgui/Utilities/SASBDB/media/sasbdb_download_help.rst b/src/sas/qtgui/Utilities/SASBDB/media/sasbdb_download_help.rst new file mode 100644 index 0000000000..00d7a30d43 --- /dev/null +++ b/src/sas/qtgui/Utilities/SASBDB/media/sasbdb_download_help.rst @@ -0,0 +1,141 @@ +.. sasbdb_download_help.rst + +.. _SASBDB_Download: + +Loading Datasets from SASBDB +============================= + +Description +----------- + +The SASBDB (Small Angle Scattering Biological Data Bank) download feature allows you to directly download and load datasets from the SASBDB database into SasView. This feature provides easy access to published small-angle scattering data and automatically populates metadata from the SASBDB entry. + +Accessing the Feature +--------------------- + +To access the SASBDB download feature: + +1. Select **File > Load from SASBDB...** from the main menu +2. A dialog will appear prompting you to enter a SASBDB dataset identifier + +Dataset Identifier +------------------ + +Enter a valid SASBDB dataset identifier in the input field. The identifier can be: + +- A 7-character SASBDB code (e.g., ``SASDN24``, ``SASDB1234``) +- The identifier is case-insensitive and will be automatically normalized + +Examples of valid identifiers: +- ``SASDN24`` +- ``sasdn24`` (will be converted to uppercase) + +Download Process +---------------- + +When you click **Download and Load**: + +1. **Validation**: The dataset identifier is validated +2. 
**Metadata Retrieval**: The system fetches metadata from the SASBDB REST API +3. **Data Download**: The experimental data file is downloaded to a temporary location +4. **Data Loading**: The downloaded data is automatically loaded into SasView +5. **Metadata Population**: Metadata from SASBDB is automatically populated into the loaded dataset + +Progress Indicator +------------------ + +During the download process, you will see: + +- A progress bar indicating the download is in progress +- Status messages showing the current operation +- A success message with a summary of the loaded dataset + +Metadata Population +-------------------- + +When a dataset is loaded from SASBDB, the following metadata is automatically extracted and populated: + +**Sample Information:** +- Sample ID (from molecule short name, if available) +- Sample details section populated with: + - Sequence (when available) + - Molecule/sample description + - UniProt code (when available) + - Oligomerization and number of molecules (when available) + - Source organism (when available) + - Temperature + - Concentration + - Buffer description and pH + +**Instrument Information:** +- Instrument/beamline name +- Detector information +- Location (city, country) +- Source type (X-ray synchrotron, neutron, etc.) + +**Experimental Parameters:** +- Wavelength +- Temperature +- Q-range (if available) + +**Structural Parameters:** +- Radius of gyration (Rg) with errors +- I(0) with errors +- Maximum dimension (Dmax) +- Molecular weight (MW) with method +- Porod volume + +**Publication Information:** +- Authors +- DOI +- PMID + +Viewing Metadata +---------------- + +After loading a SASBDB dataset, you can view the populated metadata by: + +1. Right-clicking on the loaded dataset in the Data Explorer +2. Selecting **Data Info** from the context menu +3. 
The metadata will be displayed in a clean, formatted section labeled "SASBDB Metadata" + +The metadata is displayed in a compact format with key information organized by category: +- Entry code and instrument information +- Sample ID and sample details (sequence, UniProt, oligomerization, concentration, buffer, etc.) +- Source wavelength +- Structural parameters (Rg, I(0), Dmax, MW, etc.) +- Publication information + +Error Handling +-------------- + +If an error occurs during download: + +- An error message will be displayed explaining the issue +- Common errors include: + - Invalid dataset identifier format + - Dataset not found in SASBDB + - Network connection issues + - API service unavailable + +If you encounter errors: + +1. Verify the dataset identifier is correct +2. Check your internet connection +3. Try again after a few moments if the SASBDB service is temporarily unavailable + +Tips +---- + +- **Dataset Identifiers**: You can find SASBDB dataset identifiers in published papers or on the SASBDB website (https://www.sasbdb.org) +- **Metadata**: All metadata is automatically extracted from the SASBDB entry, so you don't need to manually enter it +- **Data Format**: The downloaded data is in a standard format compatible with SasView +- **Offline Use**: Once downloaded, the data file is stored locally and can be used offline + +Related Documentation +--------------------- + +- :ref:`SASBDB Export <SASBDB_Export>` - Export your data to SASBDB format +- `SASBDB Website <https://www.sasbdb.org>`_ - Browse and search the SASBDB database +- `SASBDB REST API Documentation <https://www.sasbdb.org/rest-api/docs/>`_ - Technical API reference + diff --git a/src/sas/qtgui/Utilities/SASBDB/sasbdb_api.py b/src/sas/qtgui/Utilities/SASBDB/sasbdb_api.py new file mode 100644 index 0000000000..7c4b0f5b79 --- /dev/null +++ b/src/sas/qtgui/Utilities/SASBDB/sasbdb_api.py @@ -0,0 +1,661 @@ +""" +SASBDB REST API client module. + +Provides functions to interact with the SASBDB REST API for downloading +datasets and retrieving metadata. 
+""" + +import logging +import os +import tempfile +from dataclasses import dataclass, field + +import requests + +logger = logging.getLogger(__name__) + +# Base API URL - SASBDB REST API +SASBDB_API_BASE = "https://www.sasbdb.org/rest-api" + + +@dataclass +class SASBDBDatasetInfo: + """ + Parsed metadata from a SASBDB dataset entry. + + Contains key information extracted from the SASBDB REST API response. + """ + # Identifiers + entry_id: str = "" + code: str = "" + title: str = "" + + # Sample information + sample_name: str = "" + sample_description: str = "" + concentration: float | None = None + concentration_unit: str = "mg/mL" + + # Molecule information + molecule_name: str = "" + molecule_short_name: str = "" + molecule_type: str = "" + sequence: str = "" + uniprot_code: str = "" + source_organism: str = "" + number_of_molecules: str = "" + oligomerization: str = "" + molecular_weight: float | None = None # Experimental MW in kDa + molecular_weight_method: str = "" + oligomeric_state: str = "" + + # Buffer information + buffer_description: str = "" + ph: float | None = None + + # Experimental parameters + instrument: str = "" + detector: str = "" + wavelength: float | None = None # in Angstrom + wavelength_unit: str = "Å" + temperature: float | None = None # in Kelvin or Celsius + temperature_unit: str = "K" + + # Analysis results + rg: float | None = None # Radius of gyration in Angstrom + rg_error: float | None = None + i0: float | None = None # I(0) + i0_error: float | None = None + dmax: float | None = None # Maximum dimension in Angstrom + porod_volume: float | None = None + + # Q-range + q_min: float | None = None + q_max: float | None = None + + # Publication + publication_title: str = "" + publication_doi: str = "" + publication_pmid: str = "" + authors: list = field(default_factory=list) + + # Data files + intensities_data_url: str = "" + pddf_data_url: str = "" + + # Raw metadata for additional fields + raw_metadata: dict = 
field(default_factory=dict) + + +def parseMetadata(metadata: dict) -> SASBDBDatasetInfo: + """ + Parse SASBDB API response into a structured SASBDBDatasetInfo object. + + :param metadata: Raw JSON dictionary from SASBDB API + :return: Parsed SASBDBDatasetInfo object + """ + info = SASBDBDatasetInfo() + + if not isinstance(metadata, dict): + return info + + # Store raw metadata for reference + info.raw_metadata = metadata + + # Log top-level keys for debugging + logger.debug(f"SASBDB API response keys: {list(metadata.keys())}") + + # Identifiers + info.entry_id = _get_str(metadata, 'id', 'entry_id', 'sasbdb_id') + info.code = _get_str(metadata, 'code', 'accession_code', 'entry_code') + info.title = _get_str(metadata, 'title', 'entry_title', 'sample_title') + + # Sample information - try more variations + info.sample_name = _get_str(metadata, 'sample_name', 'sample', 'name', 'sample_name_full') + info.sample_description = _get_str(metadata, 'sample_description', 'description', 'sample_description_full') + info.concentration = _get_float(metadata, 'concentration', 'sample_concentration', 'conc', 'sample_conc') + info.concentration_unit = _get_str(metadata, 'concentration_unit', 'conc_unit', 'concentration_units') or "mg/mL" + + # Molecule information - try more variations + info.molecule_name = _get_str( + metadata, + 'molecule_name', + 'macromolecule_name', + 'protein_name', + 'molecule', + 'macromolecule', + 'protein', + 'name', + ) or _get_deep_str( + metadata, 'long_name', 'molecule_name', 'macromolecule_name', 'protein_name' + ) + info.molecule_short_name = _get_str( + metadata, 'short_name', 'molecule_short_name', 'shortName', 'short' + ) or _get_deep_str(metadata, 'short_name', 'molecule_short_name', 'shortName') + info.molecule_type = _get_str( + metadata, 'molecule_type', 'macromolecule_type', 'sample_type', + 'type', 'molecule_type_full' + ) or _get_deep_str( + metadata, 'molecule_type', 'macromolecule_type', 'molecular_type' + ) + info.sequence = 
_get_sequence(metadata) + info.uniprot_code = _get_str( + metadata, + 'uniprot_code', + 'uniprot', + 'uniprot_id', + 'uniprot_accession', + 'uniprot_ac', + ) or _get_deep_str( + metadata, 'uniprot_code', 'uniprot', 'uniprot_id', 'uniprot_accession' + ) + info.source_organism = _get_str( + metadata, 'source_organism', 'organism', 'organism_name' + ) or _get_deep_str( + metadata, 'source_organism', 'organism', 'organism_name' + ) + info.number_of_molecules = _get_str( + metadata, 'number_of_molecules', 'num_molecules', 'copy_number' + ) or _get_deep_str( + metadata, 'number_of_molecules', 'number_molecules', 'num_molecules', + 'copy_number' + ) + info.oligomerization = _get_str( + metadata, 'oligomerization', 'oligomeric_state', 'oligomer_state' + ) or _get_deep_str( + metadata, 'oligomerization', 'oligomeric_state', 'oligomer_state', + 'complex_state' + ) + info.molecular_weight = _get_float(metadata, 'molecular_weight', 'mw', 'exp_mw', + 'experimental_mw', 'mw_kda') + info.molecular_weight_method = _get_str(metadata, 'mw_method', 'molecular_weight_method') + info.oligomeric_state = info.oligomerization or _get_str( + metadata, 'oligomeric_state', 'oligomer_state', 'oligomerization' + ) + + # Buffer information + info.buffer_description = _get_str(metadata, 'buffer', 'buffer_description', 'buffer_composition') + info.ph = _get_float(metadata, 'ph', 'buffer_ph') + + # Experimental parameters + info.instrument = _get_str(metadata, 'instrument', 'beamline', 'instrument_name') + info.detector = _get_str(metadata, 'detector', 'detector_name', 'detector_type') + info.wavelength = _get_float(metadata, 'wavelength', 'xray_wavelength', 'neutron_wavelength') + info.temperature = _get_float(metadata, 'temperature', 'sample_temperature', 'temp') + + # Analysis results - Guinier + info.rg = _get_float(metadata, 'rg', 'radius_of_gyration', 'guinier_rg') + info.rg_error = _get_float(metadata, 'rg_error', 'rg_err', 'guinier_rg_error') + info.i0 = _get_float(metadata, 'i0', 
'i_zero', 'guinier_i0') + info.i0_error = _get_float(metadata, 'i0_error', 'i0_err', 'guinier_i0_error') + + # Analysis results - P(r) + info.dmax = _get_float(metadata, 'dmax', 'd_max', 'maximum_dimension') + info.porod_volume = _get_float(metadata, 'porod_volume', 'volume', 'porod_vol') + + # Q-range + info.q_min = _get_float(metadata, 'q_min', 'qmin', 's_min', 'smin') + info.q_max = _get_float(metadata, 'q_max', 'qmax', 's_max', 'smax') + + # Publication + info.publication_title = _get_str(metadata, 'publication_title', 'pub_title', 'paper_title') + info.publication_doi = _get_str(metadata, 'doi', 'publication_doi', 'pub_doi') + info.publication_pmid = _get_str(metadata, 'pmid', 'publication_pmid', 'pubmed_id') + + # Authors + authors = metadata.get('authors') or metadata.get('author_list') or [] + if isinstance(authors, list): + info.authors = [str(a) for a in authors] + elif isinstance(authors, str): + info.authors = [authors] + + # Data file URLs + info.intensities_data_url = _get_str(metadata, 'intensities_data', 'data_url', 'intensities_url') + info.pddf_data_url = _get_str(metadata, 'pddf_data', 'pddf_url', 'pr_data') + + return info + + +def _get_str(data: dict, *keys: str) -> str: + """ + Get string value from dictionary, trying multiple possible keys. + Also searches in nested dictionaries (sample, molecule, experiment, etc.). 
+ + :param data: Dictionary to search + :param keys: Possible key names to try + :return: String value or empty string if not found + """ + # First try top-level keys + for key in keys: + if key in data and data[key] is not None: + return str(data[key]) + + # Then search in common nested structures + nested_paths = ['sample', 'molecule', 'experiment', 'experimental', 'metadata', 'info'] + for path in nested_paths: + if path in data and isinstance(data[path], dict): + for key in keys: + if key in data[path] and data[path][key] is not None: + return str(data[path][key]) + + return "" + + +def _get_sequence(data: dict) -> str: + """ + Extract a protein/nucleotide sequence from nested SASBDB metadata. + + This helper searches recursively through dictionaries and lists since + sequence information may appear under molecule lists or nested blocks. + + :param data: Dictionary to search + :return: Sequence string or empty string if not found + """ + sequence_keys = { + 'sequence', + 'fasta_sequence', + 'fasta', + 'primary_sequence', + 'sequence_string', + } + + def _from_value(value) -> str: + """Normalize potential sequence values to a plain string.""" + if value is None: + return "" + if isinstance(value, str): + return value.strip() + if isinstance(value, list): + parts = [] + for item in value: + item_str = _from_value(item) + if item_str: + parts.append(item_str) + if parts: + return " ".join(parts) + return "" + if isinstance(value, dict): + # Common wrappers for sequence strings. 
+ for nested_key in ("value", "text", "seq"): + nested = value.get(nested_key) + nested_str = _from_value(nested) + if nested_str: + return nested_str + return "" + + def _search(obj) -> str: + if isinstance(obj, dict): + for key in sequence_keys: + if key in obj: + sequence = _from_value(obj[key]) + if sequence: + return sequence + for value in obj.values(): + sequence = _search(value) + if sequence: + return sequence + elif isinstance(obj, list): + for item in obj: + sequence = _search(item) + if sequence: + return sequence + return "" + + return _search(data) + + +def _get_deep_str(data: dict, *keys: str) -> str: + """ + Recursively search nested dict/list structures for the first key match. + + :param data: Dictionary to search + :param keys: Candidate field names to locate + :return: String value or empty string if not found + """ + key_set = set(keys) + + def _normalize(value) -> str: + if value is None: + return "" + if isinstance(value, str): + return value.strip() + if isinstance(value, (int, float, bool)): + return str(value) + return "" + + def _search(obj) -> str: + if isinstance(obj, dict): + for key, value in obj.items(): + if key in key_set: + normalized = _normalize(value) + if normalized: + return normalized + for value in obj.values(): + found = _search(value) + if found: + return found + elif isinstance(obj, list): + for item in obj: + found = _search(item) + if found: + return found + return "" + + return _search(data) + + +def _get_float(data: dict, *keys: str) -> float | None: + """ + Get float value from dictionary, trying multiple possible keys. + Also searches in nested dictionaries (sample, molecule, experiment, etc.). 
+ + :param data: Dictionary to search + :param keys: Possible key names to try + :return: Float value or None if not found + """ + # First try top-level keys + for key in keys: + if key in data and data[key] is not None: + try: + return float(data[key]) + except (ValueError, TypeError): + continue + + # Then search in common nested structures + nested_paths = ['sample', 'molecule', 'experiment', 'experimental', 'metadata', 'info'] + for path in nested_paths: + if path in data and isinstance(data[path], dict): + for key in keys: + if key in data[path] and data[path][key] is not None: + try: + return float(data[path][key]) + except (ValueError, TypeError): + continue + + return None + + +def getDatasetMetadata(dataset_id: str) -> dict | None: + """ + Fetch dataset metadata from SASBDB API. + + :param dataset_id: SASBDB dataset identifier (e.g., "SASDN24" - full 7-character ID) + :return: Dictionary containing dataset metadata, or None if error + """ + # Normalize dataset ID (uppercase, strip whitespace, ensure 7 characters) + normalized_id = _normalizeDatasetId(dataset_id) + if not normalized_id: + logger.error(f"Invalid dataset ID format: {dataset_id}") + return None + + # Use the correct REST API endpoint: /rest-api/entry/summary/{id}/ + endpoint = f"{SASBDB_API_BASE}/entry/summary/{normalized_id}/" + + try: + logger.info(f"Fetching dataset metadata from: {endpoint}") + headers = {"accept": "application/json"} + response = requests.get(endpoint, headers=headers, timeout=30) + response.raise_for_status() + + # Parse JSON response + metadata = response.json() + logger.info(f"Successfully retrieved metadata for dataset {normalized_id}") + return metadata + + except requests.exceptions.HTTPError as e: + if e.response.status_code == 404: + logger.error(f"Dataset {normalized_id} not found (404)") + else: + logger.error(f"HTTP error fetching dataset {normalized_id}: {e}") + return None + except requests.exceptions.RequestException as e: + logger.error(f"Network error 
fetching dataset {normalized_id}: {e}") + return None + except ValueError as e: + logger.error(f"Invalid JSON response for dataset {normalized_id}: {e}") + return None + + +def getDataFileUrl(metadata: dict) -> str | None: + """ + Extract data file URL from dataset metadata. + + Looks for common field names in the metadata JSON that might contain + the data file URL or path. Also checks for SASBDB-specific fields. + + :param metadata: Dictionary containing dataset metadata + :return: URL string for the data file, or None if not found + """ + if not isinstance(metadata, dict): + return None + + # SASBDB-specific field names that might contain data file URLs + # Priority: intensities_data is the primary field for SASBDB + possible_fields = [ + # SASBDB primary data field + 'intensities_data', + 'intensitiesData', + # Direct URL fields + 'data_file_url', + 'dataFileUrl', + 'data_file', + 'dataFile', + 'scattering_data_url', + 'scatteringDataUrl', + 'experimental_data_url', + 'experimentalDataUrl', + 'download_url', + 'downloadUrl', + 'file_url', + 'fileUrl', + # File list fields + 'files', + 'data_files', + 'dataFiles', + 'experimental_files', + 'scattering_files', + # SASBDB API specific fields + 'experimental_data', + 'experimentalData', + 'scattering_data', + 'scatteringData', + ] + + # Check top-level fields + for field in possible_fields: + if field in metadata: + value = metadata[field] + if isinstance(value, str) and (value.startswith('http') or value.startswith('/')): + # If relative URL, make it absolute + if value.startswith('/'): + return f"https://www.sasbdb.org{value}" + return value + elif isinstance(value, list) and len(value) > 0: + # If it's a list, take the first item + first_item = value[0] + if isinstance(first_item, str): + if first_item.startswith('/'): + return f"https://www.sasbdb.org{first_item}" + elif first_item.startswith('http'): + return first_item + elif isinstance(first_item, dict): + # Check for 'url' or 'path' in the item + for 
url_field in ['url', 'path', 'file', 'file_url', 'download_url']: + if url_field in first_item: + url = first_item[url_field] + if isinstance(url, str): + if url.startswith('/'): + return f"https://www.sasbdb.org{url}" + elif url.startswith('http'): + return url + + # Check nested structures (common in REST APIs) + for nested_key in ['entry', 'data', 'files', 'experimental_data', 'scattering_data']: + if nested_key in metadata and isinstance(metadata[nested_key], dict): + result = getDataFileUrl(metadata[nested_key]) + if result: + return result + + # If we have an entry ID, try constructing a download URL + # Format might be: /rest-api/entry/{id}/download/ or similar + entry_id = metadata.get('entry_id') or metadata.get('id') or metadata.get('sasbdb_id') + if entry_id: + # Try common download endpoint patterns + download_endpoints = [ + f"{SASBDB_API_BASE}/entry/{entry_id}/download/", + f"{SASBDB_API_BASE}/entry/{entry_id}/data/", + f"{SASBDB_API_BASE}/entry/{entry_id}/file/", + ] + # Return first endpoint (we'll try them in downloadDataFile if needed) + return download_endpoints[0] + + logger.warning("Could not find data file URL in metadata") + logger.debug(f"Metadata keys: {list(metadata.keys())}") + return None + + +def downloadDataFile(url: str, filepath: str) -> bool: + """ + Download a data file from the given URL to the specified filepath. 
+ + :param url: URL of the data file to download + :param filepath: Local filepath where the file should be saved + :return: True if download successful, False otherwise + """ + try: + logger.info(f"Downloading data file from: {url}") + response = requests.get(url, timeout=60, stream=True) + response.raise_for_status() + + # Create directory if it doesn't exist + os.makedirs(os.path.dirname(filepath), exist_ok=True) + + # Write file in chunks to handle large files + with open(filepath, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + logger.info(f"Successfully downloaded data file to: {filepath}") + return True + + except requests.exceptions.RequestException as e: + logger.error(f"Error downloading data file from {url}: {e}") + return False + except OSError as e: + logger.error(f"Error writing data file to {filepath}: {e}") + return False + + +def _normalizeDatasetId(dataset_id: str) -> str | None: + """ + Normalize a SASBDB dataset identifier. + + SASBDB identifiers are 7 characters long, typically in format: + - "SASDN24" (prefix + number) + - "SASDB1234" (if 4-digit number) + - etc. + + The API requires the full 7-character identifier. 
+ + :param dataset_id: Input dataset identifier + :return: Normalized identifier (uppercase, stripped), or None if invalid + """ + if not dataset_id: + return None + + # Remove whitespace and convert to uppercase + normalized = dataset_id.strip().upper() + + # SASBDB identifiers are typically 7 characters + # Accept identifiers that are 4-10 characters to be flexible + if len(normalized) < 4 or len(normalized) > 10: + logger.warning(f"Dataset ID length unusual: {len(normalized)} characters for '{normalized}'") + # Still try it, but warn + + # Return the normalized identifier as-is (API expects full identifier) + return normalized + + +def downloadDataset(dataset_id: str, output_dir: str | None = None) -> tuple[str | None, SASBDBDatasetInfo | None]: + """ + Download a complete dataset from SASBDB. + + This is a convenience function that: + 1. Fetches metadata + 2. Extracts data file URL + 3. Downloads the data file + 4. Returns the local filepath and parsed metadata + + :param dataset_id: SASBDB dataset identifier + :param output_dir: Directory to save the file (defaults to temp directory) + :return: Tuple of (path to downloaded file, parsed metadata info) or (None, None) if error + """ + # Get metadata + metadata = getDatasetMetadata(dataset_id) + if not metadata: + return None, None + + # Parse metadata into structured object + dataset_info = parseMetadata(metadata) + + # Get data file URL + data_url = getDataFileUrl(metadata) + if not data_url: + logger.error(f"Could not find data file URL in metadata for dataset {dataset_id}") + return None, dataset_info # Return metadata even if download fails + + # Store the data URL in the info object + if not dataset_info.intensities_data_url: + dataset_info.intensities_data_url = data_url + + # Determine output directory + if output_dir is None: + output_dir = tempfile.gettempdir() + + # Generate filename + normalized_id = _normalizeDatasetId(dataset_id) + # Try to determine file extension from URL or metadata + 
file_extension = _guessFileExtension(data_url, metadata) + filename = f"SASBDB_{normalized_id}{file_extension}" + filepath = os.path.join(output_dir, filename) + + # Download the file + if downloadDataFile(data_url, filepath): + return filepath, dataset_info + + return None, dataset_info + + +def _guessFileExtension(url: str, metadata: dict) -> str: + """ + Guess the file extension from URL or metadata. + + :param url: Data file URL + :param metadata: Dataset metadata + :return: File extension (e.g., ".dat", ".txt", ".csv") + """ + # Check URL for extension + if '.' in url: + parts = url.split('.') + if len(parts) > 1: + ext = '.' + parts[-1].split('?')[0] # Remove query parameters + if ext.lower() in ['.dat', '.txt', '.csv', '.out', '.asc']: + return ext + + # Check metadata for file type hints + if isinstance(metadata, dict): + file_type_fields = ['file_type', 'fileType', 'format', 'data_format'] + for field in file_type_fields: + if field in metadata: + file_type = str(metadata[field]).lower() + if 'csv' in file_type: + return '.csv' + elif 'txt' in file_type or 'text' in file_type: + return '.txt' + elif 'dat' in file_type or 'data' in file_type: + return '.dat' + + # Default to .dat for scattering data + return '.dat' +