diff --git a/backend/README.md b/backend/README.md index 9867cbeae..82cf21cc3 100644 --- a/backend/README.md +++ b/backend/README.md @@ -10,6 +10,8 @@ The backend is intended to be run as a daemon process. See the `4cat-daemon.py` script in the parent folder for instructions on how to start and control the daemon. +Also see the [architecture diagrams](architecture.md). + ## What it does Queries a central job queue for outstanding jobs, and starts workers compatible with those jobs once any are available. diff --git a/backend/architecture.md b/backend/architecture.md new file mode 100644 index 000000000..b94800552 --- /dev/null +++ b/backend/architecture.md @@ -0,0 +1,272 @@ +# Architecture for 'backend' + +## classes + +:::mermaid +classDiagram + class BasicHTTPScraper { + category : str + log_level : str + prefix + after_process() + get_url()* + not_found() + parse(data) + process(data)* + work() + } + class BasicJSONScraper { + parse(data) + } + class BasicProcessor { + category : str + config : NoneType + dataset : NoneType + db : NoneType + description : str + extension : str + filepath : NoneType + is_hidden : bool + is_running_in_preset : bool + job : NoneType + owner : NoneType + parameters + source_dataset : NoneType + source_file : NoneType + abort() + add_field_to_parent(field_name, new_data, which_parent, update_existing) + after_process() + create_standalone() + exclude_followup_processors(processor_type) + extract_archived_file_by_name(filename, archive_path, staging_area) + get_extension(parent_dataset) + get_mapped_item(item) + get_options(parent_dataset, user) + get_status() + is_4cat_processor() + is_filter() + is_from_collector() + is_rankable(multiple_items) + is_top_dataset() + iterate_archive_contents(path, staging_area, immediately_delete, filename_filter) + map_item_method_available(dataset) + process()* + remove_files() + unpack_archive_contents(path, staging_area) + work() + write_archive_and_finish(files, num_items, compression, finish) + 
write_csv_items_and_finish(data) + } + class BasicWorker { + INTERRUPT_CANCEL : int + INTERRUPT_NONE : bool + INTERRUPT_RETRY : int + config : NoneType + db : Database + init_time : int + interrupted : bool + job : NoneType + log : NoneType + manager : NoneType + max_workers : int + modules : NoneType + name : str + queue : NoneType + type : str + abort()* + clean_up()* + is_4cat_class() + is_4cat_processor() + request_interrupt(level) + run() + work()* + } + class DatasetCanceller { + max_workers : int + type : str + work() + } + class DatasourceMetrics { + ensure_job : dict + max_workers : int + type : str + data_stats() + folder_size(path) + general_stats() + work() + } + class FourcatRestarterAndUpgrader { + max_workers : int + type : str + work() + } + class InternalAPI { + ensure_job : dict + host : dict + max_workers : int + port : dict + type : str + api_response(client, address) + process_request(request, payload) + work() + } + class InternalAPIException { + } + class MySQLDatabase { + connection : Connection + cursor : NoneType + log : NoneType + close() + commit() + fetchall(query) + fetchone(query) + mogrify(query, replacements) + query(query, replacements) + rollback() + } + class ProcessorPreset { + after_process() + get_processor_pipeline()* + process() + } + class QueryCanceller { + max_workers : int + type : str + work() + } + class Search { + import_error_count : int + import_warning_count : int + max_workers : int + prefix : str + return_cols : list + type : str + get_items(query)* + import_from_file(path) + items_to_archive(items, filepath) + items_to_csv(results, filepath) + items_to_ndjson(items, filepath) + process() + search(query) + } + class SearchWithScope { + fetch_posts(post_ids, where, replacements)* + fetch_threads(thread_ids)* + get_items(query)* + get_items_complex(query)* + get_items_simple(query)* + get_search_mode(query) + get_thread_sizes(thread_ids, min_length)* + search(query) + } + class TempFileCleaner { + days_to_keep : 
int + ensure_job : dict + max_workers : int + tracking_file + type : str + work() + } + class ThingExpirer { + ensure_job : dict + max_workers : int + type : str + expire_datasets() + expire_notifications() + expire_users() + work() + } + class UpdateChecker { + ensure_job : dict + max_workers : int + type : str + work() + } + class WorkerManager { + db : NoneType + job_mapping : dict + log : NoneType + looping : bool + modules : NoneType + pool : list + queue : NoneType + worker_pool : dict + abort(signal, stack) + delegate() + loop() + request_interrupt(interrupt_level, job, remote_id, jobtype) + validate_datasources() + } + ProcessorPreset --|> BasicProcessor + BasicProcessor --|> BasicWorker + BasicHTTPScraper --|> BasicWorker + BasicJSONScraper --|> BasicHTTPScraper + Search --|> BasicProcessor + SearchWithScope --|> Search + InternalAPI --|> BasicWorker + DatasetCanceller --|> BasicWorker + QueryCanceller --|> BasicWorker + UpdateChecker --|> BasicWorker + TempFileCleaner --|> BasicWorker + DatasourceMetrics --|> BasicWorker + ThingExpirer --|> BasicWorker + FourcatRestarterAndUpgrader --|> BasicWorker +::: + +## packages + +:::mermaid +classDiagram + class backend { + } + class bootstrap { + } + class lib { + } + class database_mysql { + } + class manager { + } + class preset { + } + class processor { + } + class scraper { + } + class search { + } + class worker { + } + class workers { + } + class api { + } + class cancel_dataset { + } + class cancel_query { + } + class check_updates { + } + class cleanup_tempfiles { + } + class datasource_metrics { + } + class expire_items { + } + class restart_4cat { + } + bootstrap --> manager + preset --> processor + processor --> worker + scraper --> worker + search --> processor + api --> worker + cancel_dataset --> worker + cancel_query --> worker + check_updates --> worker + cleanup_tempfiles --> worker + datasource_metrics --> worker + expire_items --> worker + restart_4cat --> worker +::: + diff --git 
a/common/README.md b/common/README.md index b5e91f577..59eb4aedb 100644 --- a/common/README.md +++ b/common/README.md @@ -3,4 +3,6 @@ This folder contains files used by both the back-end daemon and the front-end web app. -Additionally, it contains assets (i.e. static files) used for analyses. \ No newline at end of file +Additionally, it contains assets (i.e. static files) used for analyses. + +Also see the [architecture diagrams](architecture.md). diff --git a/common/architecture.md b/common/architecture.md new file mode 100644 index 000000000..12d3f3896 --- /dev/null +++ b/common/architecture.md @@ -0,0 +1,454 @@ +# Architecture for 'common' + +## classes + +:::mermaid +classDiagram + class ConfigDummy { + } + class ConfigException { + } + class ConfigManager { + cache : dict + config_definition : dict + core_settings : dict + db : NoneType + dbconn : NoneType + tag_context : list + delete_for_tag(attribute_name, tag) + ensure_database() + get(attribute_name, default, is_json, user, tags) + get_active_tags(user, tags) + get_all(is_json, user, tags) + load_core_settings() + load_user_settings() + set(attribute_name, value, is_json, tag, overwrite_existing) + with_db(db) + } + class ConfigWrapper { + config + request : NoneType + tags : NoneType + user : NoneType + get() + get_active_tags(user, tags) + get_all() + request_override(tags) + set() + } + class CsvDialectException { + } + class DataSet { + available_processors : NoneType + children : NoneType + data : NoneType + db : NoneType + folder : NoneType + genealogy : NoneType + is_new : bool + key : str + modules : NoneType + no_status_updates : bool + owners : NoneType + parameters : NoneType + preset_parent : NoneType + result_file + staging_areas : NoneType + tagged_owners : NoneType + add_owner(username, role) + change_datasource(datasource) + check_dataset_finished() + clear_log() + copy(shallow) + copy_ownership_from(dataset, recursive) + delete(commit, queue) + delete_parameter(parameter, instant) + 
detach() + finish(num_rows) + finish_with_error(error) + get_all_children(recursive) + get_annotation_fields() + get_annotations() + get_available_processors(user, exclude_hidden) + get_breadcrumbs() + get_columns() + get_compatible_processors(user) + get_extension() + get_genealogy(inclusive) + get_key(query, parameters, parent, time_offset) + get_label(parameters, default) + get_log_path() + get_media_type() + get_metadata() + get_modules() + get_own_processor() + get_owners(role) + get_owners_users(role) + get_parameters() + get_parent() + get_place_in_queue(update) + get_progress() + get_result_url() + get_results_folder_path() + get_results_path() + get_staging_area() + get_status() + get_version_url(file) + is_accessible_by(username, role) + is_dataset() + is_expired(user) + is_expiring(user) + is_finished() + is_from_collector() + is_rankable(multiple_items) + is_top_dataset() + iterate_items(processor, warn_unmappable, map_missing) + link_job(job) + link_parent(key_parent) + log(log) + nearest(type_filter) + refresh_owners() + remove_owner(username) + remove_staging_areas() + reserve_result_file(parameters, extension) + set_key(key) + top_parent() + update_children() + update_label(label) + update_progress(progress) + update_status(status, is_final) + update_version(version) + warn_unmappable_item(item_count, processor, error_message, warn_admins) + } + class DataSetException { + } + class DataSetNotFoundException { + } + class Database { + appname : str + connection + cursor : NoneType + interruptable_job : NoneType + interruptable_timeout : int + interrupted : bool + log : NoneType + close() + commit() + delete(table, where, commit) + execute(query, replacements) + execute_many(query, commit, replacements) + fetchall(query) + fetchall_interruptable(queue, query) + fetchone(query) + get_cursor() + insert(table, data, commit, safe, constraints, return_field) + query(query, replacements, cursor) + rollback() + update(table, data, where, commit) + 
upsert(table, data, commit, constraints) + } + class DatabaseQueryInterruptedException { + } + class DatasetItem { + mapped_object + missing_fields + original + } + class DmiServiceManager { + local_or_remote + num_files_to_process : NoneType + path_to_files : NoneType, Path + path_to_results : NoneType, Path + processed_files : int + processor + server_address + server_file_collection_name : NoneType + server_results_folder_name : NoneType + check_gpu_memory_available(service_endpoint) + check_progress() + check_service_exists() + count_local_files(directory) + download_results(filenames_to_download, folder_name, local_output_dir, timeout) + get_folder_name(dataset) + process_files(input_file_dir, filenames, output_file_dir, server_file_collection_name, server_results_folder_name) + process_results(local_output_dir) + request_folder_files(folder_name) + sanitize_filenames(filename) + send_files(file_collection_name, results_name, files_to_upload, dir_with_files) + send_request_and_wait_for_results(service_endpoint, data, wait_period, check_process, callback) + } + class DmiServiceManagerException { + } + class DsmConnectionError { + } + class DsmOutOfMemory { + } + class FourcatException { + frame : NoneType + } + class FourcatModule { + } + class HTMLStripper { + convert_charrefs : bool + fed : list + strict : bool + get_data() + handle_data(data) + } + class HashCache { + hash_cache : dict + hasher + update_cache(value) + } + class Job { + data : dict + db : NoneType + details + is_claimed : bool + is_finished : bool + claim() + finish(delete) + get_by_ID(database) + get_by_data(database) + get_by_remote_ID(database, jobtype) + get_place_in_queue() + is_claimable() + release(delay, claim_after) + } + class JobAlreadyExistsException { + } + class JobClaimedException { + } + class JobNotFoundException { + } + class JobQueue { + db : NoneType + log : NoneType + add_job(jobtype, details, remote_id, claim_after, interval) + get_all_jobs(jobtype, remote_id, 
restrict_claimable) + get_job(jobtype, timestamp, restrict_claimable) + get_job_count(jobtype) + get_place_in_queue(job) + release_all() + } + class Logger { + alert_level : str + db : NoneType + levels : dict + log_path : NoneType + logger : NoneType + previous_report : int + print_logs : bool + critical(message, frame) + debug(message, frame) + error(message, frame) + fatal(message, frame) + info(message, frame) + log(message, level, frame) + warning(message, frame) + } + class MapItemException { + } + class MappedItem { + data + message : str + missing + get_item_data(safe) + get_message() + get_missing_fields() + } + class MappedItemIncompleteException { + } + class MissingMappedField { + value + } + class ModuleCollector { + PROCESSOR : int + WORKER : int + datasources : dict + ignore : list + log_buffer : NoneType + missing_modules : dict + processors : dict + workers : dict + expand_datasources() + is_4cat_class(object, only_processors) + load_datasources() + load_modules() + load_worker_class(worker) + } + class NullAwareTextIOWrapper { + } + class ProcessorException { + } + class ProcessorInterruptedException { + } + class QueryException { + } + class QueryNeedsExplicitConfirmationException { + } + class QueryNeedsFurtherInputException { + config + } + class QueryParametersException { + } + class QueueException { + } + class RequirementsNotMetException { + } + class SlackLogHandler { + mapLogRecord(record) + } + class User { + config : NoneType + data : NoneType + db : NoneType + is_active : bool + is_admin + is_anonymous : bool + is_authenticated : bool + is_deactivated + is_special + name : str + userdata : NoneType + add_notification(notification, expires, allow_dismiss) + add_tag(tag) + authenticate() + can_access_dataset(dataset, role) + clear_token() + delete(also_datasets) + dismiss_notification(notification_id) + email_token(new) + generate_token(username, regenerate) + get_by_login(db, name, password, config) + get_by_name(db, name, config) + 
get_by_token(db, token, config) + get_id() + get_name() + get_notifications() + get_token() + get_value(key, default) + remove_tag(tag) + set_password(password) + set_value(key, value) + sort_user_tags() + with_config(config) + } + class UserInput { + OPTIONS_COSMETIC : tuple + OPTION_CHOICE : str + OPTION_DATASOURCES : str + OPTION_DATE : str + OPTION_DATERANGE : str + OPTION_DIVIDER : str + OPTION_FILE : str + OPTION_HUE : str + OPTION_INFO : str + OPTION_MULTI : str + OPTION_MULTI_SELECT : str + OPTION_TEXT : str + OPTION_TEXT_JSON : str + OPTION_TEXT_LARGE : str + OPTION_TOGGLE : str + parse_all(options, input, silently_correct) + parse_value(settings, choice, other_input, silently_correct) + } + class WebHookLogHandler { + server_name : str + emit(record) + } + class WorkerInterruptedException { + } + DataSet --|> FourcatModule + DsmConnectionError --|> DmiServiceManagerException + DsmOutOfMemory --|> DmiServiceManagerException + ConfigException --|> FourcatException + CsvDialectException --|> FourcatException + DataSetException --|> FourcatException + DataSetNotFoundException --|> DataSetException + DatabaseQueryInterruptedException --|> WorkerInterruptedException + JobAlreadyExistsException --|> QueueException + JobClaimedException --|> QueueException + JobNotFoundException --|> QueueException + MapItemException --|> ProcessorException + MappedItemIncompleteException --|> MapItemException + ProcessorException --|> FourcatException + ProcessorInterruptedException --|> WorkerInterruptedException + QueryException --|> FourcatException + QueryNeedsExplicitConfirmationException --|> QueryException + QueryNeedsFurtherInputException --|> QueryException + QueryParametersException --|> QueryException + QueueException --|> FourcatException + WorkerInterruptedException --|> FourcatException + SlackLogHandler --|> WebHookLogHandler + ConfigManager --* User : config + Database --* ConfigManager : db + ModuleCollector --* DataSet : modules +::: + +## packages + 
 +:::mermaid +classDiagram + class common { + } + class config_manager { + } + class lib { + } + class config_definition { + } + class database { + } + class dataset { + } + class dmi_service_manager { + } + class exceptions { + } + class fourcat_module { + } + class helpers { + } + class item_mapping { + } + class job { + } + class logger { + } + class module_loader { + } + class queue { + } + class user { + } + class user_input { + } + config_manager --> config_definition + config_manager --> database + config_manager --> exceptions + config_manager --> user_input + config_definition --> user_input + database --> exceptions + dataset --> config_manager + dataset --> exceptions + dataset --> fourcat_module + dataset --> helpers + dataset --> item_mapping + dataset --> job + dataset --> module_loader + dmi_service_manager --> helpers + helpers --> config_manager + helpers --> user_input + job --> exceptions + logger --> config_manager + module_loader --> config_manager + queue --> job + user --> config_manager + user --> dataset + user --> exceptions + user --> helpers + user_input --> exceptions + user_input --> helpers +::: + diff --git a/helper-scripts/generate-diagrams.sh b/helper-scripts/generate-diagrams.sh new file mode 100644 index 000000000..c5ba1938f --- /dev/null +++ b/helper-scripts/generate-diagrams.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env sh + +#################################################################################################### +# This script uses pyreverse (from the pylint package) to automatically generate architecture +# diagrams for each package in the project. +#################################################################################################### + +if ! command -v pyreverse > /dev/null 2>&1 +then + echo "pyreverse could not be found. Please install pylint." 
 + exit 1 +fi + +OUTPUT_FORMAT="mmd" +PYREVERSE_OPTIONS="--colorized -o $OUTPUT_FORMAT" + +SEARCH_DEPTH=2 # Search for Python packages up to this depth + +for PACKAGE_INIT in $(find . -maxdepth $SEARCH_DEPTH -name __init__.py) +do + PACKAGE_NAME=$(echo $PACKAGE_INIT | cut -f 2 -d /) + echo "Generating class diagram for package '$PACKAGE_NAME'..." + pyreverse $PYREVERSE_OPTIONS -p $PACKAGE_NAME $PACKAGE_NAME + + OUTPUT_FILE="$PACKAGE_NAME/architecture.md" + echo "Writing to file '$OUTPUT_FILE'..." + echo "# Architecture for '$PACKAGE_NAME'" > $OUTPUT_FILE + echo "" >> $OUTPUT_FILE + + for PREFIX in "classes" "packages" + do + echo "## $PREFIX" >> $OUTPUT_FILE + echo "" >> $OUTPUT_FILE + echo ":::mermaid" >> $OUTPUT_FILE + + MERMAID_FILE=${PREFIX}_${PACKAGE_NAME}.${OUTPUT_FORMAT} + cat $MERMAID_FILE >> $OUTPUT_FILE + rm $MERMAID_FILE + + echo ":::" >> $OUTPUT_FILE + echo "" >> $OUTPUT_FILE + + done +done diff --git a/setup.py b/setup.py index 4f40eb5e4..9144b07e5 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ "packaging", "pandas==1.5.3", "Pillow>=10.3", + "pylint~=2.17", # pin PyLint version 2 for 4Cat and pyreverse "praw~=7.0", "prawcore~=2.0", "psutil~=5.0", diff --git a/webtool/architecture.md b/webtool/architecture.md new file mode 100644 index 000000000..2971ea2fa --- /dev/null +++ b/webtool/architecture.md @@ -0,0 +1,89 @@ +# Architecture for 'webtool' + +## classes + +:::mermaid +classDiagram + class OpenAPICollector { + apis : set + endpoints : dict + flask_app : NoneType + type_map : dict + endpoint(api_id) + generate(api_id) + schema_to_schema(schema) + } + class Pagination { + has_next + has_prev + page + pages + per_page + route : str + route_args : dict + total_count + iter_pages(left_edge, left_current, right_current, right_edge) + } +::: + +## packages + +:::mermaid +classDiagram + class webtool { + } + class lib { + } + class helpers { + } + class openapi_collector { + } + class template_filters { + } + class views { + } + class api_explorer { + } + 
class api_standalone { + } + class api_tool { + } + class views_admin { + } + class views_dataset { + } + class views_extensions { + } + class views_misc { + } + class views_restart { + } + class views_user { + } + webtool --> helpers + webtool --> openapi_collector + webtool --> views_admin + template_filters --> webtool + template_filters --> helpers + api_explorer --> webtool + api_explorer --> helpers + api_standalone --> webtool + api_standalone --> helpers + api_tool --> webtool + api_tool --> helpers + views_admin --> webtool + views_admin --> helpers + views_dataset --> webtool + views_dataset --> helpers + views_dataset --> api_tool + views_extensions --> webtool + views_misc --> webtool + views_misc --> helpers + views_misc --> views_dataset + views_restart --> webtool + views_restart --> helpers + views_user --> webtool + views_user --> helpers + views_user --> api_tool +::: +