diff --git a/examples/entity_risk_analyzer.py b/examples/entity_risk_analyzer.py new file mode 100644 index 0000000..643cf27 --- /dev/null +++ b/examples/entity_risk_analyzer.py @@ -0,0 +1,83 @@ +from pathlib import Path + +from bigdata_client.models.search import DocumentType + +from bigdata_research_tools.client import bigdata_connection +from bigdata_research_tools.llm.base import LLMConfig +from bigdata_research_tools.utils.observer import OberserverNotification, Observer +from bigdata_research_tools.workflows.risk_analyzer import RiskAnalyzer + + +def risk_analyzer_example( + risk_scenario: str, + llm_model_config: str | LLMConfig | dict, + keywords: list|None = None, + control_entities: dict = {'place':['United States', 'China']}, + focus: str = "", + export_path: str = "entity_risk_analyzer_results.xlsx", +) -> dict: + GRID_watchlist_ID = "8747febb-8762-40f9-bf9b-4b9d6909deb4" + + bigdata = bigdata_connection() + # Retrieve the watchlist object + watchlist_grid = bigdata.watchlists.get(GRID_watchlist_ID) + # Access the items within the watchlist + entities = bigdata.knowledge_graph.get_entities(watchlist_grid.items) + + analyzer = RiskAnalyzer( + main_theme=risk_scenario, + entities=entities, + start_date="2025-09-01", + end_date="2025-09-30", + keywords=keywords, + document_type=DocumentType.NEWS, + control_entities=control_entities, + focus=focus, # Optional focus to narrow the theme, + llm_model_config=llm_model_config, + ground_mindmap=False, + ) + + class PrintObserver(Observer): + def update(self, message: OberserverNotification): + print(f"Notification received: {message}") + + analyzer.register_observer(PrintObserver()) + + return analyzer.screen_companies(export_path=export_path) + + +if __name__ == "__main__": + import logging + + from dotenv import load_dotenv + + # Load environment variables for authentication + print(f"Environment variables loaded: {load_dotenv()}") + + # Set the logging configuration to show the logs of the library + 
logging.basicConfig() + logging.getLogger("bigdata_research_tools").setLevel(logging.INFO) + + output_path = Path("outputs/entity_risk_analyzer_results.xlsx") + output_path.parent.mkdir(parents=True, exist_ok=True) + + x = risk_analyzer_example( + "US China Trade War", + focus="Generate a mind map of current and future risks that metals, rare earths and semiconductors commodities traders are facing as a result of increased trade tensions between the United States and China.", + export_path=str(output_path), + llm_model_config=LLMConfig( + model="openai::gpt-4o-mini", + ), + ) + # custom_config = { + # 'entity_column': 'Entity', + # 'heatmap_colorscale': 'Plasma', + # 'dashboard_height': 1800, + # 'top_themes_count': 5, + # 'main_title': 'Custom Thematic Analysis Dashboard' + # } + df = x["df_entity"] + # fig, industry_fig = create_thematic_exposure_dashboard(df, n_companies=15, config=custom_config) + # fig.show(renderer="browser") # Shows the main dashboard + # industry_fig.show(renderer="browser") # Shows the industry analysis + print(df.head(10)) # Display the first 10 rows of the DataFrame diff --git a/src/bigdata_research_tools/labeler/entity_labeler.py b/src/bigdata_research_tools/labeler/entity_labeler.py new file mode 100644 index 0000000..f09b946 --- /dev/null +++ b/src/bigdata_research_tools/labeler/entity_labeler.py @@ -0,0 +1,420 @@ +from logging import Logger, getLogger +from typing import Any + +from pandas import DataFrame, Series + +from bigdata_research_tools.labeler.labeler import Labeler +from bigdata_research_tools.llm.base import LLMConfig +from bigdata_research_tools.prompts.labeler import ( + get_other_entity_placeholder, + get_entity_risk_system_prompt, + get_entity_theme_system_prompt, + get_target_entity_placeholder, +) + +logger: Logger = getLogger(__name__) + +class EntityRiskLabeler(Labeler): + def __init__( + self, + llm_model_config: str | LLMConfig | dict = "openai::gpt-4o-mini", + label_prompt: str | None = None, + # TODO (cpinto, 
2025.02.07) This value is also in the prompt used. + # Changing it here would break the process. + unknown_label: str = "unclear", + ): + """ + Args: + llm_model: Name of the LLM model to use. Expected format: + ::, e.g. "openai::gpt-4o-mini" + label_prompt: Prompt provided by user to label the search result chunks. + If not provided, then our default labelling prompt is used. + unknown_label: Label for unclear classifications + """ + super().__init__(llm_model_config, unknown_label) + self.label_prompt = label_prompt + + def get_labels( + self, + main_theme: str, + labels: list[str], + texts: list[str], + max_workers: int = 50, + timeout: int | None = 55, + textsconfig: list[dict[str, Any]] | None = None, + ) -> DataFrame: + """ + Process thematic labels for texts. + + Args: + main_theme: The main theme to analyze. + labels: Labels for labelling the chunks. + texts: List of chunks to label. + timeout: Timeout for each LLM request. + max_workers: Maximum number of concurrent workers. + + Returns: + DataFrame with schema: + - index: sentence_id + - columns: + - motivation + - label + """ + system_prompt = ( + get_entity_risk_system_prompt(main_theme, labels) + if self.label_prompt is None + else self.label_prompt + ) + + logger.info(f"Using system prompt: {system_prompt}") + + prompts = self.get_prompts_for_labeler(texts, textsconfig) + + responses = self._run_labeling_prompts( + prompts, + system_prompt, + max_workers=max_workers, + timeout=timeout, + processing_callbacks=[ + self.parse_labeling_response, + self._deserialize_label_response, + ], + ) + + return self._convert_to_label_df(responses) + + def post_process_dataframe(self, df: DataFrame, extra_fields: dict, extra_columns: list[str]) -> DataFrame: + """ + Post-process the labeled DataFrame. + + Args: + df: DataFrame to process. 
Schema: + - Index: int + - Columns: + - timestamp_utc: datetime64 + - document_id: str + - sentence_id: str + - headline: str + - entity_id: str + - entity_name: str + - entity_country: str + - text: str + - other_entities: str + - entities: List[Dict[str, Any]] + - key: str + - name: str + - start: int + - end: int + - masked_text: str + - other_entities_map: List[Tuple[int, str]] + - label: str + - motivation: str + Returns: + Processed DataFrame. Schema: + - index: int + - Columns: + - Time Period + - Date + - Entity + - Country + - Document ID + - Headline + - Quote + - Motivation + - Theme + - Sentiment + """ + # Filter unlabeled sentences + df = df.loc[df["label"] != "unclear"].copy() + if df.empty: + print(f"Empty dataframe: all rows labelled unclear") + return df + + # Process timestamps + df["timestamp_utc"] = df["timestamp_utc"].dt.tz_localize(None) + + # Sort and format + sort_columns = ["entity_name", "timestamp_utc", "label"] + df = df.sort_values(by=sort_columns).reset_index(drop=True) + + # Replace company placeholders + df["motivation"] = df.apply(replace_company_placeholders, axis=1) + + # Add formatted columns + df["Time Period"] = df["timestamp_utc"].dt.strftime("%b %Y") + df["Date"] = df["timestamp_utc"].dt.strftime("%Y-%m-%d") + + df["Document ID"] = df["document_id"] if "document_id" in df.columns else df["rp_document_id"] + + columns_map = { + "entity_name": "Entity", + "entity_type": "Entity Type", + "entity_country": "Country", + "headline": "Headline", + "text": "Quote", + "sentiment": "Sentiment", + "motivation": "Motivation", + "label": "Sub-Scenario", + "other_entities_name": "Other Entities", + "other_entities_id": "Other Entities IDs", + "other_entities_type": "Other Entities Types", + } + + if 'entity_sentiment' in df.columns: + columns_map.update({ + "entity_sentiment": "Entity Sentiment", + "entity_text_sentiment": "Entity Text Sentiment" + }) + + if extra_fields: + columns_map.update(extra_fields) + if "quotes" in 
extra_fields.keys(): + if "quotes" in df.columns: + df["quotes"] = df.apply(replace_company_placeholders, axis=1, col_name = 'quotes') + else: + print("quotes column not in df") + + df = df.rename( + columns=columns_map + ) + + # Select and order columns + export_columns = [ + "Time Period", + "Date", + "Entity", + "Entity Type", + "Country", + "Document ID", + "Headline", + "Quote", + "Sentiment", + "Motivation", + "Sub-Scenario", + "Other Entities", + "Other Entities IDs", + "Other Entities Types" + ] + + if extra_columns: + export_columns += extra_columns + + return df[export_columns] + +class EntityScreenerLabeler(Labeler): + def __init__( + self, + llm_model_config: str | LLMConfig | dict = "openai::gpt-4o-mini", + label_prompt: str | None = None, + # TODO (cpinto, 2025.02.07) This value is also in the prompt used. + # Changing it here would break the process. + unknown_label: str = "unclear", + ): + """ + Args: + llm_model: Name of the LLM model to use. Expected format: + ::, e.g. "openai::gpt-4o-mini" + label_prompt: Prompt provided by user to label the search result chunks. + If not provided, then our default labelling prompt is used. + unknown_label: Label for unclear classifications + """ + super().__init__(llm_model_config, unknown_label) + self.label_prompt = label_prompt + + def get_labels( + self, + main_theme: str, + labels: list[str], + texts: list[str], + max_workers: int = 50, + timeout: int | None = 55, + textsconfig: list[dict[str, Any]] | None = None, + ) -> DataFrame: + """ + Process thematic labels for texts. + + Args: + main_theme: The main theme to analyze. + labels: Labels for labelling the chunks. + texts: List of chunks to label. + timeout: Timeout for each LLM request. + max_workers: Maximum number of concurrent workers. 
+ + Returns: + DataFrame with schema: + - index: sentence_id + - columns: + - motivation + - label + """ + system_prompt = ( + get_entity_theme_system_prompt(main_theme, labels) + if self.label_prompt is None + else self.label_prompt + ) + + prompts = self.get_prompts_for_labeler(texts, textsconfig) + + responses = self._run_labeling_prompts( + prompts, + system_prompt, + max_workers=max_workers, + timeout=timeout, + processing_callbacks=[ + self.parse_labeling_response, + self._deserialize_label_response, + ], + ) + + return self._convert_to_label_df(responses) + + def post_process_dataframe(self, df: DataFrame, extra_fields: dict, extra_columns: list[str]) -> DataFrame: + """ + Post-process the labeled DataFrame. + + Args: + df: DataFrame to process. Schema: + - Index: int + - Columns: + - timestamp_utc: datetime64 + - document_id: str + - sentence_id: str + - headline: str + - entity_id: str + - entity_name: str + - entity_country: str + - text: str + - other_entities: str + - entities: List[Dict[str, Any]] + - key: str + - name: str + - start: int + - end: int + - masked_text: str + - other_entities_map: List[Tuple[int, str]] + - label: str + - motivation: str + Returns: + Processed DataFrame. 
Schema: + - index: int + - Columns: + - Time Period + - Date + - Entity + - Country + - Document ID + - Headline + - Quote + - Motivation + - Theme + - Sentiment + """ + # Filter unlabeled sentences + df = df.loc[df["label"] != "unclear"].copy() + if df.empty: + print(f"Empty dataframe: all rows labelled unclear") + return df + + # Process timestamps + df["timestamp_utc"] = df["timestamp_utc"].dt.tz_localize(None) + + # Sort and format + sort_columns = ["entity_name", "timestamp_utc", "label"] + df = df.sort_values(by=sort_columns).reset_index(drop=True) + + # Replace company placeholders + df["motivation"] = df.apply(replace_company_placeholders, axis=1) + + # Add formatted columns + df["Time Period"] = df["timestamp_utc"].dt.strftime("%b %Y") + df["Date"] = df["timestamp_utc"].dt.strftime("%Y-%m-%d") + + df["Document ID"] = df["document_id"] if "document_id" in df.columns else df["rp_document_id"] + + columns_map = { + "entity_name": "Entity", + "entity_type": "Entity Type", + "entity_country": "Country", + "headline": "Headline", + "text": "Quote", + "sentiment": "Sentiment", + "motivation": "Motivation", + "label": "Theme", + "other_entities_name": "Other Entities", + "other_entities_id": "Other Entities IDs", + "other_entities_type": "Other Entities Types", + } + + if extra_fields: + columns_map.update(extra_fields) + if "quotes" in extra_fields.keys(): + if "quotes" in df.columns: + df["quotes"] = df.apply(replace_company_placeholders, axis=1, col_name = 'quotes') + else: + print("quotes column not in df") + + df = df.rename( + columns=columns_map + ) + + # Select and order columns + export_columns = [ + "Time Period", + "Date", + "Entity", + "Entity Type", + "Country", + "Document ID", + "Headline", + "Quote", + "Sentiment", + "Motivation", + "Theme", + "Other Entities", + "Other Entities IDs", + "Other Entities Types" + ] + + if extra_columns: + export_columns += extra_columns + + return df[export_columns] + +def replace_company_placeholders( + row: Series, 
col_name: str = "motivation" +) -> str | list[str]: + """ + Replace company placeholders in text. + + Args: + row: Row of the DataFrame. Expected columns: + - motivation: str + - entity_name: str + - other_entities_map: List[Tuple[int, str]] + Returns: + Text with placeholders replaced. + """ + text = row[col_name] + entity_type = row.get("entity_type", "COMP") + if isinstance(text, str): + text = text.replace(get_target_entity_placeholder(entity_type), row["entity_name"]) + if row.get("other_entities_map"): + for entity_id, entity_name in row["other_entities_map"]: + text = text.replace( + f"{get_other_entity_placeholder(entity_type)}_{entity_id}", entity_name + ) + + elif isinstance(text, list): + text = [ + t.replace(get_target_entity_placeholder(entity_type), row["entity_name"]) for t in text + ] + if row.get("other_entities_map"): + for entity_id, entity_name in row["other_entities_map"]: + text = [ + t.replace( + f"{get_other_entity_placeholder(entity_type)}_{entity_id}", entity_name + ) + for t in text + ] + + return text \ No newline at end of file diff --git a/src/bigdata_research_tools/labeler/risk_labeler.py b/src/bigdata_research_tools/labeler/risk_labeler.py index 6a18cb6..2cf129f 100644 --- a/src/bigdata_research_tools/labeler/risk_labeler.py +++ b/src/bigdata_research_tools/labeler/risk_labeler.py @@ -228,23 +228,24 @@ def replace_company_placeholders( Text with placeholders replaced. 
""" text = row[col_name] + entity_type = row.get("entity_type", "COMP") if isinstance(text, str): - text = text.replace(get_target_entity_placeholder(), row["entity_name"]) + text = text.replace(get_target_entity_placeholder(entity_type), row["entity_name"]) if row.get("other_entities_map"): for entity_id, entity_name in row["other_entities_map"]: text = text.replace( - f"{get_other_entity_placeholder()}_{entity_id}", entity_name + f"{get_other_entity_placeholder(entity_type)}_{entity_id}", entity_name ) elif isinstance(text, list): text = [ - t.replace(get_target_entity_placeholder(), row["entity_name"]) for t in text + t.replace(get_target_entity_placeholder(entity_type), row["entity_name"]) for t in text ] if row.get("other_entities_map"): for entity_id, entity_name in row["other_entities_map"]: text = [ t.replace( - f"{get_other_entity_placeholder()}_{entity_id}", entity_name + f"{get_other_entity_placeholder(entity_type)}_{entity_id}", entity_name ) for t in text ] diff --git a/src/bigdata_research_tools/labeler/screener_labeler.py b/src/bigdata_research_tools/labeler/screener_labeler.py index 28fc5a7..42aeccc 100644 --- a/src/bigdata_research_tools/labeler/screener_labeler.py +++ b/src/bigdata_research_tools/labeler/screener_labeler.py @@ -227,10 +227,11 @@ def replace_company_placeholders(row: Series) -> str: Text with placeholders replaced. 
""" text = row["motivation"] - text = text.replace(get_target_entity_placeholder(), row["entity_name"]) + entity_type = row.get("entity_type", "COMP") + text = text.replace(get_target_entity_placeholder(entity_type), row["entity_name"]) if row.get("other_entities_map"): for entity_id, entity_name in row["other_entities_map"]: text = text.replace( - f"{get_other_entity_placeholder()}_{entity_id}", entity_name + f"{get_other_entity_placeholder(entity_type)}_{entity_id}", entity_name ) return text diff --git a/src/bigdata_research_tools/mindmap/mindmap_utils.py b/src/bigdata_research_tools/mindmap/mindmap_utils.py index 0f8e25b..eccae1e 100644 --- a/src/bigdata_research_tools/mindmap/mindmap_utils.py +++ b/src/bigdata_research_tools/mindmap/mindmap_utils.py @@ -148,6 +148,147 @@ """ ), }, + "risk_entity": { + "qualifier": "Risk Scenario", + "user_prompt_message": "Your given Risk Scenario is: {main_theme}", + "default_instructions": ( + "Forget all previous prompts." + "You are assisting a professional risk analyst tasked with creating a taxonomy to classify the impact of the Risk Scenario '**{main_theme}**' on entities, such as countries, commodities, geographical places, and organizations." + "Your objective is to generate a **comprehensive tree structure** that maps the **risk spillovers** stemming from the Risk Scenario '**{main_theme}**', and generates related sub-scenarios. " + "Key Instructions:" + "1. **Understand the Risk Scenario: '{main_theme}'**:" + " - The Risk Scenario '**{main_theme}**' represents a central, multifaceted concept that may be harmful or beneficial to an entity." + " - Your task is to identify how the Risk Scenario impacts entities through various **risk spillovers** and transmission channels." + " - Summarize the Risk Scenario '**{main_theme}**' in a **short list of essential keywords**." + " - The keyword list should be short (1-2 keywords). Avoid unnecessary, unmentioned, indirectly inferred, or redundant keywords." + "2. 
**Create a Tree Structure for Risk Spillovers and Sub-Scenarios**:" + " - Decompose the Risk Scenario into **distinct, focused, and self-contained risk spillovers**." + " - Each risk spillover must represent a **specific risk channel** through which entities are exposed to as a consequence of the Risk Scenario." + " - Label each **primary node** in the tree explicitly as a \"Risk\" in the `Label` field. For example:" + " - Use 'Cost Risk' instead of 'Cost Impacts'." + " - Use 'Supply Chain Risk' instead of 'Supply Chain Disruptions'." + " - Risk spillovers must:" + " - Cover a wide range of potential impacts on entities' and long-term stabiliity and growth." + " - Explore both macroeconomic and microeconomic dimensions of the Risk Scenario '**{main_theme}**' and analyze their impact on entities when relevant." + " - Include **direct and indirect consequences** of the main scenario." + " - Represent **dimensions of risk** that entities must monitor or mitigate." + " - NOT overlap." + " - Independently identify the most relevant spillovers based on the Risk Scenario '**{main_theme}**', without limiting to predefined categories." + "3. **Generate Sub-Scenarios for Each Risk Spillover**:" + " - For each risk spillover, identify **specific sub-scenarios** that will arise as a consequence of the Risk Scenario '**{main_theme}**'." + " - All sub-scenarios must:" + " - Be **concise and descriptive sentences**, clearly stating how the sub-scenario is an event caused by the main scenario." + " - **Explicitly include ALL core concepts and keywords** from the main scenario, including specific geographical locations or temporal details, in every sentence in order to ensure clarity and relevance towards the main scenario." + " - Integrate the Risk Scenario in a natural way, avoiding repetitive or mechanical structures." + " - Not exceed 15 words." + " - Sub-scenarios MUST be mutually exclusive: they CANNOT overlap neither within nor across branches of the tree." 
+ " - Do NOT combine multiple sub-scenarios in a single label." + " - Sub-Scenarios have to be consistent with the parent Risk Spillover (e.g. Market Access related sub-scenarios have to belong to the Market Access Risk node)." + " - Generate 3 OR MORE sub-scenarios for each risk spillover." + " - Generate a short label for each subscenario." + "4. **Iterate Based on the Analyst's Focus: '{analyst_focus}'**:" + " - After generating the initial tree structure, use the analyst's focus ('{analyst_focus}') to:" + " - Identify **missing branches** or underexplored areas of the tree." + " - Add new risk spillovers or sub-scenarios that align with the analyst's focus." + " - Ensure that sub-scenarios ALWAYS include ALL core components of the Risk Scenario and are formulated as natural sentences." + " - Ensure that sub-scenarios DO NOT overlap within and across risk spillovers." + " - Ensure that sub-scenarios belong to the correct Risk Spillover." + " - If the analyst focus is empty, skip this step." + " - If you don't understand the analyst focus ('{analyst_focus}'), ask an open-ended question to the analyst." + "5. **Review and Expand the Tree for Missing Risks**:" + " - After incorporating the analyst's focus, review the tree structure to ensure it includes a **broad range of risks** and sub-scenarios." + " - Add any missing risks or sub-scenarios to the tree." + ), + "enforce_structure_string": ( + """IMPORTANT: Your response MUST be a valid JSON object. Each node in the JSON object must include:\n" + " - `node`: an integer representing the unique identifier for the node.\n" + " - `label`: a string for the name of the sub-theme.\n" + " - `summary`: a string to explain briefly in maximum 15 words why the sub-theme is related to the main theme or risk.\n" + " - `children`: an array of child nodes.\n" + "Format the JSON object as a nested dictionary. Be careful when specifying keys and items.\n" + "Avoid overlapping labels. 
Break down joint concepts into unique parents so that each parent represents ONLY ONE concept. AVOID creating branch names such as 'Compliance and Regulatory Risk'. Keep risks separate and create a single branch for each risk, such as 'Compliance Risk' and 'Regulatory Risk', each with their own children.\n" + "Return ONLY the JSON object, with no extra text, explanation, or markdown.\n" + "You MUST use ONLY these field names: label, node, summary, children. Do NOT use underscores, spaces, or any other characters in field names. If you use any other field names, your answer will be rejected.\n" + "## Example Structure:\n" + "**Theme: Global Warming**\n\n" + "{\n" + " \"node\": 1,\n" + " \"label\": \"Global Warming\",\n" + " \"summary\": \"Global Warming is a serious risk\",\n" + " \"children\": [\n" + " {\"node\": 2, \"label\": \"Renewable Energy Adoption\", \"summary\": \"Renewable energy reduces greenhouse gas emissions and thereby global warming and climate change effects\", \"children\": [\n" + " {\"node\": 5, \"label\": \"Solar Energy\", \"summary\": \"Solar energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 6, \"label\": \"Wind Energy\", \"summary\": \"Wind energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 7, \"label\": \"Hydropower\", \"summary\": \"Hydropower reduces greenhouse gas emissions\"}\n" + " ]},\n" + " {\"node\": 3, \"label\": \"Carbon Emission Reduction\", \"summary\": \"Carbon emission reduction decreases greenhouse gases\", \"children\": [\n" + " {\"node\": 8, \"label\": \"Carbon Capture Technology\", \"summary\": \"Carbon capture technology reduces atmospheric CO2\"},\n" + " {\"node\": 9, \"label\": \"Emission Trading Systems\", \"summary\": \"Emission trading systems incentivize reductions in greenhouse gases\"}\n" + " ]}\n" + " ]\n" + "}\n" + """ + ), + }, + "theme_entity": { + "qualifier": "Main Theme", + "user_prompt_message": "Your given Theme is: {main_theme}", + "default_instructions": ( + "Forget all previous 
prompts." + "You are assisting a professional analyst tasked with creating a screener to measure the impact of the theme {main_theme} on other entities, such countries, commodities, geographical places, and organizations." + "Your objective is to generate a comprehensive tree structure of distinct sub-themes that will guide the analyst's research process." + "Follow these steps strictly:" + "1. **Understand the Core Theme {main_theme}**:" + " - The theme {main_theme} is a central concept. All components are essential for a thorough understanding." + "2. **Create a Taxonomy of Sub-themes for {main_theme}**:" + " - Decompose the main theme {main_theme} into concise, focused, and self-contained sub-themes." + " - Each sub-theme should represent a singular, concise, informative, and clear aspect of the main theme." + " - Expand the sub-theme to be relevant for the {main_theme}: a single word is not informative enough." + " - Prioritize clarity and specificity in your sub-themes." + " - Avoid repetition and strive for diverse angles of exploration." + " - Provide a comprehensive list of potential sub-themes." + "3. **Iterate Based on the Analyst's Focus {analyst_focus}**:" + " - If no specific {analyst_focus} is provided, transition directly to formatting the JSON response." + "4. **Format Your Response as a JSON Object**:" + " - Each node in the JSON object must include:" + " - `node`: an integer representing the unique identifier for the node." + " - `label`: a string for the name of the sub-theme." + " - `summary`: a string to explain briefly in maximum 15 words why the sub-theme is related to the theme {main_theme}." + " - For the node referring to the first node {main_theme}, just define briefly in maximum 15 words the theme {main_theme}." + " - `children`: an array of child nodes." + ), + "enforce_structure_string": ( + """IMPORTANT: Your response MUST be a valid JSON object. 
Each node in the JSON object must include:\n" + "- `node`: an integer representing the unique identifier for the node.\n" + "- `label`: a string for the name of the sub-theme.\n" + "- `summary`: a string to explain briefly in maximum 15 words why the sub-theme is related to the theme.\n" + "- For the node referring to the main theme, just define briefly in maximum 15 words the theme.\n" + "- `children`: an array of child nodes.\n" + "Format the JSON object as a nested dictionary. Be careful when specifying keys and items.\n" + "Avoid overlapping labels. Break down joint concepts into unique parents so that each parent represents ONLY ONE concept. AVOID creating branch names such as 'Compliance and Regulatory Risk'. Keep risks separate and create a single branch for each risk, such as 'Compliance Risk' and 'Regulatory Risk', each with their own children.\n" + "Return ONLY the JSON object, with no extra text, explanation, or markdown.\n" + "You MUST use ONLY these field names: label, node, summary, children. Do NOT use underscores, spaces, or any other characters in field names. 
If you use any other field names, your answer will be rejected.\n" + "## Example Structure:\n" + "**Theme: Global Warming**\n\n" + "{\n" + " \"node\": 1,\n" + " \"label\": \"Global Warming\",\n" + " \"summary\": \"Global Warming is a serious risk\",\n" + " \"children\": [\n" + " {\"node\": 2, \"label\": \"Renewable Energy Adoption\", \"summary\": \"Renewable energy reduces greenhouse gas emissions and thereby global warming and climate change effects\", \"children\": [\n" + " {\"node\": 5, \"label\": \"Solar Energy\", \"summary\": \"Solar energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 6, \"label\": \"Wind Energy\", \"summary\": \"Wind energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 7, \"label\": \"Hydropower\", \"summary\": \"Hydropower reduces greenhouse gas emissions\"}\n" + " ]},\n" + " {\"node\": 3, \"label\": \"Carbon Emission Reduction\", \"summary\": \"Carbon emission reduction decreases greenhouse gases\", \"children\": [\n" + " {\"node\": 8, \"label\": \"Carbon Capture Technology\", \"summary\": \"Carbon capture technology reduces atmospheric CO2\"},\n" + " {\"node\": 9, \"label\": \"Emission Trading Systems\", \"summary\": \"Emission trading systems incentivize reductions in greenhouse gases\"}\n" + " ]}\n" + " ]\n" + "}\n" + """ + ), + }, } diff --git a/src/bigdata_research_tools/portfolio/motivation.py b/src/bigdata_research_tools/portfolio/motivation.py index 4f9555e..3ae10ae 100644 --- a/src/bigdata_research_tools/portfolio/motivation.py +++ b/src/bigdata_research_tools/portfolio/motivation.py @@ -80,12 +80,13 @@ def group_quotes_by_company(self, filtered_df: pd.DataFrame) -> dict: ] if missing_columns: - available_columns = list(filtered_df.columns) - raise ValueError( - f"Missing required columns: {missing_columns}. 
" - f"Available columns are: {available_columns}" - ) - + required_columns = ["Entity", "Quote", "Theme"] + missing_columns = [col for col in required_columns if col not in filtered_df.columns] + if missing_columns: + raise ValueError( + f"Missing required columns: {missing_columns}. " + f"Available columns are: {list(filtered_df.columns)}" + ) # Check if DataFrame is empty if filtered_df.empty: logger.warning("Warning: DataFrame is empty. Returning empty dictionary.") @@ -95,7 +96,7 @@ def group_quotes_by_company(self, filtered_df: pd.DataFrame) -> dict: # Use .get() with default values as additional safety for _, row in filtered_df.iterrows(): - company = row.get("Company", "Unknown Company") + company = row.get("Company", "Entity") quote = row.get("Quote", "") theme = row.get("Theme", "Unknown Theme") @@ -152,6 +153,7 @@ def generate_company_motivations( theme_name: str, word_range: tuple[int, int], use_case: MotivationType = MotivationType.THEMATIC_SCREENER, + entity_type: str = "COMP", ) -> pd.DataFrame: """ Generates motivation statement with specified verbosity for companies in a thematic watchlist. 
@@ -183,6 +185,7 @@ def generate_company_motivations( word_range[0], word_range[1], use_case=use_case, + entity_type=entity_type, ) # Generate motivation with this word range diff --git a/src/bigdata_research_tools/prompts/labeler.py b/src/bigdata_research_tools/prompts/labeler.py index f7ae4f0..a556805 100644 --- a/src/bigdata_research_tools/prompts/labeler.py +++ b/src/bigdata_research_tools/prompts/labeler.py @@ -1,12 +1,15 @@ -from os import environ +def get_other_entity_placeholder(entity_type:str) -> str: + if entity_type == 'COMP': + return "Other Company" + else: + return "Other Entity" -def get_other_entity_placeholder() -> str: - return environ.get("BIGDATA_OTHER_ENTITY_PLACEHOLDER", "Other Company") - - -def get_target_entity_placeholder() -> str: - return environ.get("BIGDATA_TARGET_ENTITY_PLACEHOLDER", "Target Company") +def get_target_entity_placeholder(entity_type:str) -> str: + if entity_type == 'COMP': + return "Target Company" + else: + return "Target Entity" narrative_system_prompt_template: str = """ @@ -314,3 +317,225 @@ def get_risk_system_prompt(main_theme: str, label_summaries: list[str]) -> str: return risk_system_prompt_template.format( main_theme=main_theme, label_summaries=label_summaries ) + +def get_entity_risk_system_prompt(main_theme: str, label_summaries: list) -> str: + """ + Generate the risk entity labeler prompt with the provided parameters. + + Args: + main_theme (str): The main risk theme being analyzed + label_summaries (list): List of risk sub-scenario summaries + + Returns: + str: The formatted prompt string + """ + return entity_risk_system_prompt_template.format( + main_theme=main_theme, label_summaries=label_summaries + ) + +entity_risk_system_prompt_template: str = """Forget all previous prompts. + +You are assisting a professional analyst in evaluating both the exposure and risk classification for "Target Entity" regarding the Risk Scenario "{main_theme}". 
+This involves a two-step process: confirming exposure of "Target Entity" and classifying specific risks if exposure is confirmed. Use the headline for contextual understanding.
+
+
+You will receive the following information:
+- ID: [text ID]
+- Headline: [The Headline of the News Article containing Text]
+- Text: [Paragraph requiring analysis]
+- Risk Scenario: "{main_theme}"
+
+
+Follow these guidelines:
+
+
+- Examine whether the text explicitly mentions the Risk Scenario "{main_theme}" or any of its core components.
+- Ensure that "Target Entity" is the main focus of the text and that it is clearly stated that "Target Entity" is facing or will face consequences caused by the Risk Scenario "{main_theme}".
+- Assess if there are DIRECT consequences on "Target Entity’s" internal and external activities, operations, future performance, stability, sustainability, and long term growth.
+- Designate the exposure as unclear if the text lacks an explicit DIRECT link between "Target Entity" and the Risk Scenario "{main_theme}".
+- Designate the exposure as unclear if the text relies on generic information.
+
+
+
+If direct exposure of Target Entity is confirmed:
+
+- Identify and classify the specific risk using this list of Risk Sub-Scenarios:
+  "{label_summaries}".
+
+- Follow a detailed classification process:
+  - Examine the text to confirm how the Risk Scenario "{main_theme}" directly impacts "Target Entity" through one of the Risk Sub-Scenarios.
+  - Write a concise motivation that explains the direct link between "Target Entity" and the Risk Sub-Scenario as stated in the text.
+  - The motivation should always start with "Target Entity".
+  - Identify an appropriate Risk Sub-Scenario label from the list that describes explicitly the impact on Target Entity's internal and external activities, operations, stability, sustainability, and long term growth or performance.
+  - Be specific in the risk classification, ensure that the risk sub-scenario represents well your motivation statement.
+  - Ensure that the Risk Sub-Scenario label can be directly extracted from the text and that it describes with high granularity how "Target Entity" is affected.
+  - Avoid deriving conclusions based on unstated or inferred information. Focus only on the explicit content of the text or headline.
+
+
+
+- Extract verbatim quotes from the text that support the classification and illustrate Target Entity's exposure to the specific Risk Sub-Scenario.
+- Ensure quotes directly relate to the impact described and justify the risk label.
+- Extract full sentences or phrases that clearly indicate, as standalone statements, how "Target Entity" is affected by the Risk Scenario "{main_theme}" and the Sub-Scenario label assigned.
+
+
+
+Structure your response as a JSON object with the sentence ID as the key containing:
+"motivation": A concise explanation describing the link between "Target Entity" and the Risk Sub-Scenario.
+"label": State the specific risk Sub-Scenario label or 'unclear'.
+"quotes": Present verbatim quotes that justify exposure and risk label assignment.
+
+Format: {{"":
+{{"motivation": "", "label": "",
+ "quotes": ""}}
+}}.
+
+
+
+ID: 3
+Headline: "Tariffs to Strain Supply Chains Globally"
+Text: "New tariffs against China will significantly impact Target Entity's operations due to its reliance on raw materials from Chinese suppliers."
+Scenario: "New Tariffs against China"
+Output:
+
+{{3:{{
+    "motivation": "Target Entity's supply operations are directly impacted by new tariffs due to their reliance on raw materials sourced from China.",
+    "label": "Supply Chain Disruption",
+    "quotes": ["New tariffs against China will significantly impact Target Entity's operations", "reliance on raw materials from Chinese suppliers"]
+}}
+}}
+
+ID: 5
+Headline: "Interest Rate Fluctuations to Affect Markets"
+Text: "Target Entity's analysts are forecasting higher risks associated with potential interest rate changes."
+Scenario: "Interest Rate Volatility"
+Output:
+
+{{5:{{
+    "motivation": "Target Entity's analysts are forecasting higher risks associated with potential interest rate changes.",
+    "label": "unclear",
+    "quotes": []
+}}}}
+
+ID: 2
+Headline: "Economic Challenges Ahead Due to Tariffs on China"
+Text: "Other Entity's analysts report a potential economic downturn in Target Entity linked to new tariffs against China."
+Risk Scenario: "New Tariffs Against China"
+Output:
+
+{{2:{{
+    "motivation": "Target Entity's analysts are assessing the potential economic impact of new tariffs against China.",
+    "label": "Economic Downturns",
+    "quotes": ["Other Entity's analysts report a potential economic downturn in Target Entity"]
+}}}}
+
+ID: 3
+Headline: "Analyzing External Factors in Business Strategy"
+Text: "Other Entity is studying external factors such as tariffs to gauge potential risks. Other Entity's analysts report a potential economic downturn in Target Entity."
+Risk Scenario: "New Tariffs on Semiconductors"
+Output:
+
+{{3:{{
+    "motivation": "Other Entity's analysis of external factors does not establish a direct link to Target Entity.",
+    "label": "unclear",
+    "quotes": []
+}}}}
+
+ID: 4
+Headline: "Market Trends Influence Stock Performance"
+Text: "Target Entity’s stock is influenced by broad market trends."
+Risk Scenario: "Increased Uncertainty and Volatility"
+Output:
+
+{{4:{{
+    "motivation": "The text does not relate to the Risk Scenario and it does not mention any specific risk sub-scenario affecting Target Entity.",
+    "label": "unclear",
+    "quotes": []
+}}}}
+
+ID: 5
+
+Headline: "Tariffs and Their Economic Impact"
+Text: "Other Entity researchers estimate that tariffs will affect the broader economy in Target Entity."
+Risk Scenario: "New Tariffs against China"
+Output:
+
+{{5:{{
+    "motivation": "Target Entity is not linked with any specific risk sub-scenario or any tangible effect of the Risk Scenario.",
+    "label": "unclear",
+    "quotes": []
+}}}}
+
+ID: 2
+Headline: "China Tariffs Impact Supply Chains"
+Text: "According to recent reports, Target Entity is heavily dependent on China. The recent tariffs against China have forced Target Entity to reconsider its supply chain, potentially leading to increased logistics costs."
+Risk Scenario: "New Tariffs against China"
+Output:
+
+{{2:{{
+    "motivation": "Target Entity is said to be reconsidering its supply chain in the face of the risk scenario. The text clearly links Target Entity with the Risk Scenario and mentions an explicit Sub-scenario risk of Supply Chain Disruptions.",
+    "label": "Supply Chain Disruption",
+    "quotes": [
+        "Target Entity is heavily dependent on China",
+        "The recent tariffs against China have forced Target Entity to reconsider its supply chain, potentially leading to increased logistics costs."
+    ]
+}}}}
+
+"""
+
+def get_entity_theme_system_prompt(main_theme: str, label_summaries: list) -> str:
+    """
+    Generate the entity screener labeler prompt with the provided parameters.
+
+    Args:
+        main_theme (str): The main theme being analyzed
+        label_summaries (list): List of risk theme summaries
+
+    Returns:
+        str: The formatted prompt string
+    """
+    return entity_theme_system_prompt_template.format(
+        main_theme=main_theme, label_summaries=label_summaries, unknown_label="unclear",
+    )
+
+entity_theme_system_prompt_template: str = """
+    Forget all previous prompts.
+    You are assisting a professional analyst in evaluating the impact of the theme '{main_theme}' on an entity "Target Entity".
+    Your primary task is first, to ensure that each sentence is explicitly related to '{main_theme}', and second, to accurately associate each given sentence with
+    the relevant label contained within the list '{label_summaries}'.
+
+    Please adhere strictly to the following guidelines:
+
+    1. **Analyze the Sentence**:
+       - Each input consists of a sentence ID, an entity name ('Target Entity'), and the sentence text.
+       - Analyze the sentence to understand if the content clearly establishes a connection to '{main_theme}'.
+       - Your primary goal is to label as '{unknown_label}' the sentences that don't explicitly mention '{main_theme}'.
+       - Analyze the list of labels '{label_summaries}' used for label assignment. '{label_summaries}' is a Python list variable containing distinct labels and their definition in format 'Label: Summary', you must pick the label only from the 'Label' part, which means the left side of the colon for each Label:Summary pair.
+       - Your secondary goal is to select the most appropriate label from '{label_summaries}' that corresponds to the content of the sentence.
+
+    2. **First Label Assignment**:
+       - Assign the label '{unknown_label}' to the sentence related to "Target Entity" when it does not explicitly mention '{main_theme}'. Otherwise, don't assign a label.
+       - Evaluate each sentence independently, focusing solely on the context provided within that specific sentence.
+       - Use only the information contained within the sentence for your label assignment.
+ - When evaluating the sentence, "Target Entity" must clearly mention that the entity is clearly impacted by '{main_theme}'. + - Many sentences are only tangentially connected to the topic '{main_theme}'. These sentences must be assigned the label '{unknown_label}'. + + 3. **Second Label Assignment**: + - For the sentences not labeled as '{unknown_label}' and only for them, assign a unique label from the list '{label_summaries}' to the sentence related to "Target Entity". + - Evaluate each sentence independently, focusing solely on the context provided within that specific sentence. + - Use only the information contained within the sentence for your label assignment. + - Ensure that the sentence clearly establishes a connection to the label you assigned and to the theme '{main_theme}'. + - You must not create a new label or choose a label that is not present in '{label_summaries}'. + - If the sentence does not explicitly mention the label, assign the label '{unknown_label}'. + - When evaluating the sentence, "Target Entity" must clearly mention that the entity is impacted by the label assigned and '{main_theme}'. + + 4. **Response Format**: + - Your output should be structured as a JSON object that includes: + 1. A brief motivation for your choice. + 2. The assigned label. + - Each entry must start with the sentence ID and contain a clear motivation that begins with "Target Entity". + - The motivation should explain why the label was selected from '{label_summaries}' based on the information in the sentence and in the context of '{main_theme}'. It should also justify the label that had been assigned. + - Ensure that the exact context is understood and labels are based only on explicitly mentioned information in the sentence. Otherwise, assign the label '{unknown_label}'. + - The assigned label should be only the string that precedes the character ':'. + - Format your JSON as follows: {{"": {{"motivation": "", "label": "