from logging import Logger, getLogger
from typing import Any

from pandas import DataFrame, Series

from bigdata_research_tools.labeler.labeler import Labeler
from bigdata_research_tools.llm.base import LLMConfig
from bigdata_research_tools.prompts.labeler import (
    get_other_entity_placeholder,
    get_entity_risk_system_prompt,
    get_entity_theme_system_prompt,
    get_target_entity_placeholder,
)

logger: Logger = getLogger(__name__)

# Label value that marks a chunk as not classifiable. The same literal is
# used as the default ``unknown_label`` of both labelers below.
# NOTE(review): post-processing filters on this literal rather than on the
# label configured on the instance -- confirm whether that is intended.
_UNCLEAR_LABEL = "unclear"


class EntityRiskLabeler(Labeler):
    """Labeler that assigns risk sub-scenario labels to entity-centric text chunks."""

    def __init__(
        self,
        llm_model_config: str | LLMConfig | dict = "openai::gpt-4o-mini",
        label_prompt: str | None = None,
        # TODO (cpinto, 2025.02.07) This value is also in the prompt used.
        #  Changing it here would break the process.
        unknown_label: str = "unclear",
    ):
        """
        Args:
            llm_model_config: Model configuration forwarded unchanged to
                ``Labeler.__init__``. Defaults to "openai::gpt-4o-mini".
            label_prompt: Prompt provided by the user to label the search
                result chunks. If not provided, the default entity-risk
                system prompt is used.
            unknown_label: Label for unclear classifications; forwarded to
                ``Labeler.__init__``.
        """
        super().__init__(llm_model_config, unknown_label)
        self.label_prompt = label_prompt

    def get_labels(
        self,
        main_theme: str,
        labels: list[str],
        texts: list[str],
        max_workers: int = 50,
        timeout: int | None = 55,
        textsconfig: list[dict[str, Any]] | None = None,
    ) -> DataFrame:
        """
        Label text chunks against the risk sub-scenarios of a risk scenario.

        Args:
            main_theme: The main risk scenario to analyze.
            labels: Labels for labelling the chunks.
            texts: List of chunks to label.
            max_workers: Maximum number of concurrent workers.
            timeout: Timeout for each LLM request.
            textsconfig: Optional per-text configuration, passed together
                with ``texts`` to ``get_prompts_for_labeler``.

        Returns:
            The DataFrame built by ``_convert_to_label_df`` from the parsed
            LLM responses.
        """
        # A user-supplied prompt takes precedence over the built-in one.
        system_prompt = (
            get_entity_risk_system_prompt(main_theme, labels)
            if self.label_prompt is None
            else self.label_prompt
        )

        prompts = self.get_prompts_for_labeler(texts, textsconfig)

        responses = self._run_labeling_prompts(
            prompts,
            system_prompt,
            max_workers=max_workers,
            timeout=timeout,
            processing_callbacks=[
                self.parse_labeling_response,
                self._deserialize_label_response,
            ],
        )

        return self._convert_to_label_df(responses)

    def post_process_dataframe(
        self, df: DataFrame, extra_fields: dict, extra_columns: list[str]
    ) -> DataFrame:
        """
        Post-process the labeled DataFrame into the export layout.

        Args:
            df: Labeled DataFrame. Columns read here: ``label``,
                ``timestamp_utc``, ``entity_name``, ``motivation``,
                ``document_id`` (or ``rp_document_id``), plus every source
                column that is renamed into an export column.
            extra_fields: Additional ``{source column: export name}``
                renames. When it contains the key ``"quotes"`` and the
                column exists, entity placeholders in it are replaced too.
            extra_columns: Additional (already renamed) columns appended to
                the export.

        Returns:
            DataFrame with columns Time Period, Date, Entity, Country,
            Document ID, Headline, Quote, Sentiment, Bigdata Sentiment,
            Motivation, Sub-Scenario, Other Entities, Other Entities IDs,
            Other Entities Types, then Entity Sentiment / Entity Text
            Sentiment when ``entity_sentiment`` is present, then
            ``extra_columns``. If no row survives the "unclear" filter, the
            empty, un-renamed frame is returned instead.

        Raises:
            KeyError: If an export column is missing after renaming.
        """
        return _post_process_entity_dataframe(
            df,
            extra_fields,
            extra_columns,
            label_column="Sub-Scenario",
            with_entity_sentiment=True,
        )


class EntityScreenerLabeler(Labeler):
    """Labeler that assigns thematic labels to entity-centric text chunks."""

    def __init__(
        self,
        llm_model_config: str | LLMConfig | dict = "openai::gpt-4o-mini",
        label_prompt: str | None = None,
        # TODO (cpinto, 2025.02.07) This value is also in the prompt used.
        #  Changing it here would break the process.
        unknown_label: str = "unclear",
    ):
        """
        Args:
            llm_model_config: Model configuration forwarded unchanged to
                ``Labeler.__init__``. Defaults to "openai::gpt-4o-mini".
            label_prompt: Prompt provided by the user to label the search
                result chunks. If not provided, the default entity-theme
                system prompt is used.
            unknown_label: Label for unclear classifications; forwarded to
                ``Labeler.__init__``.
        """
        super().__init__(llm_model_config, unknown_label)
        self.label_prompt = label_prompt

    def get_labels(
        self,
        main_theme: str,
        labels: list[str],
        texts: list[str],
        max_workers: int = 50,
        timeout: int | None = 55,
        textsconfig: list[dict[str, Any]] | None = None,
    ) -> DataFrame:
        """
        Label text chunks against the sub-themes of a main theme.

        Args:
            main_theme: The main theme to analyze.
            labels: Labels for labelling the chunks.
            texts: List of chunks to label.
            max_workers: Maximum number of concurrent workers.
            timeout: Timeout for each LLM request.
            textsconfig: Optional per-text configuration, passed together
                with ``texts`` to ``get_prompts_for_labeler``.

        Returns:
            The DataFrame built by ``_convert_to_label_df`` from the parsed
            LLM responses.
        """
        # A user-supplied prompt takes precedence over the built-in one.
        system_prompt = (
            get_entity_theme_system_prompt(main_theme, labels)
            if self.label_prompt is None
            else self.label_prompt
        )

        prompts = self.get_prompts_for_labeler(texts, textsconfig)

        responses = self._run_labeling_prompts(
            prompts,
            system_prompt,
            max_workers=max_workers,
            timeout=timeout,
            processing_callbacks=[
                self.parse_labeling_response,
                self._deserialize_label_response,
            ],
        )

        return self._convert_to_label_df(responses)

    def post_process_dataframe(
        self, df: DataFrame, extra_fields: dict, extra_columns: list[str]
    ) -> DataFrame:
        """
        Post-process the labeled DataFrame into the export layout.

        Args:
            df: Labeled DataFrame. Columns read here: ``label``,
                ``timestamp_utc``, ``entity_name``, ``motivation``,
                ``document_id`` (or ``rp_document_id``), plus every source
                column that is renamed into an export column.
            extra_fields: Additional ``{source column: export name}``
                renames. When it contains the key ``"quotes"`` and the
                column exists, entity placeholders in it are replaced too.
            extra_columns: Additional (already renamed) columns appended to
                the export.

        Returns:
            DataFrame with columns Time Period, Date, Entity, Country,
            Document ID, Headline, Quote, Sentiment, Bigdata Sentiment,
            Motivation, Theme, Other Entities, Other Entities IDs,
            Other Entities Types, then ``extra_columns``. If no row
            survives the "unclear" filter, the empty, un-renamed frame is
            returned instead.

        Raises:
            KeyError: If an export column is missing after renaming.
        """
        return _post_process_entity_dataframe(
            df,
            extra_fields,
            extra_columns,
            label_column="Theme",
            with_entity_sentiment=False,
        )


def _post_process_entity_dataframe(
    df: DataFrame,
    extra_fields: dict | None,
    extra_columns: list[str] | None,
    *,
    label_column: str,
    with_entity_sentiment: bool,
) -> DataFrame:
    """Shared post-processing for both entity labelers.

    ``label_column`` is the export name given to the ``label`` column;
    ``with_entity_sentiment`` enables the optional entity-sentiment columns.
    """
    # Filter unlabeled sentences
    df = df.loc[df["label"] != _UNCLEAR_LABEL].copy()
    if df.empty:
        logger.warning("Empty dataframe: all rows labelled unclear")
        return df

    # Drop timezone information from the timestamps
    df["timestamp_utc"] = df["timestamp_utc"].dt.tz_localize(None)

    # Sort and format
    df = df.sort_values(by=["entity_name", "timestamp_utc", "label"]).reset_index(
        drop=True
    )

    # Replace entity placeholders in the LLM-written motivation
    df["motivation"] = df.apply(replace_company_placeholders, axis=1)

    # Add formatted columns
    df["Time Period"] = df["timestamp_utc"].dt.strftime("%b %Y")
    df["Date"] = df["timestamp_utc"].dt.strftime("%Y-%m-%d")
    df["Document ID"] = (
        df["document_id"] if "document_id" in df.columns else df["rp_document_id"]
    )

    columns_map = {
        "entity_name": "Entity",
        "entity_country": "Country",
        "headline": "Headline",
        "text": "Quote",
        "bigdata_sentiment": "Bigdata Sentiment",
        "sentiment": "Sentiment",
        "motivation": "Motivation",
        "label": label_column,
        "other_entities_name": "Other Entities",
        "other_entities_id": "Other Entities IDs",
        "other_entities_type": "Other Entities Types",
    }

    if with_entity_sentiment and "entity_sentiment" in df.columns:
        columns_map.update(
            {
                "entity_sentiment": "Entity Sentiment",
                "entity_text_sentiment": "Entity Text Sentiment",
            }
        )

    if extra_fields:
        columns_map.update(extra_fields)
        if "quotes" in extra_fields:
            if "quotes" in df.columns:
                # Must run before the rename below, while the column is
                # still called "quotes".
                df["quotes"] = df.apply(
                    replace_company_placeholders, axis=1, col_name="quotes"
                )
            else:
                logger.warning("quotes column not in df")

    df = df.rename(columns=columns_map)

    # Select and order columns
    export_columns = [
        "Time Period",
        "Date",
        "Entity",
        "Country",
        "Document ID",
        "Headline",
        "Quote",
        "Sentiment",
        "Bigdata Sentiment",
        "Motivation",
        label_column,
        "Other Entities",
        "Other Entities IDs",
        "Other Entities Types",
    ]

    if with_entity_sentiment and "Entity Sentiment" in df.columns:
        logger.info("Including entity sentiment columns in export")
        export_columns += ["Entity Sentiment", "Entity Text Sentiment"]

    if extra_columns:
        export_columns += extra_columns

    return df[export_columns]


def replace_company_placeholders(row: Series, col_name: str = "motivation") -> Any:
    """
    Replace entity placeholders in one column of a row.

    Args:
        row: Row of the DataFrame. Expected columns:
            - <col_name>: str, or a list/tuple of str
            - entity_name: str
            - other_entities_map: List[Tuple[int, str]] (optional)
        col_name: Column whose text is rewritten. Defaults to "motivation",
            which was the only column handled before this parameter existed.

    Returns:
        The text with placeholders replaced. A list/tuple value yields a
        list with every string element rewritten; any other non-string
        value (e.g. a missing value) is returned unchanged.
    """
    replacements = [(get_target_entity_placeholder(), row["entity_name"])]

    other_entities = row.get("other_entities_map")
    # A missing cell is a float NaN, which is truthy but not iterable.
    if other_entities is not None and not isinstance(
        other_entities, (str, bytes, float)
    ):
        other_placeholder = get_other_entity_placeholder()
        # Longest ids first: otherwise "<placeholder>_1" would also rewrite
        # the head of "<placeholder>_10". sorted() is stable, so ids of
        # equal length keep their original order.
        ordered = sorted(
            other_entities, key=lambda pair: len(str(pair[0])), reverse=True
        )
        replacements.extend(
            (f"{other_placeholder}_{entity_id}", entity_name)
            for entity_id, entity_name in ordered
        )

    def _substitute(text: str) -> str:
        for placeholder, name in replacements:
            text = text.replace(placeholder, name)
        return text

    value = row[col_name]
    if isinstance(value, str):
        return _substitute(value)
    if isinstance(value, (list, tuple)):
        return [_substitute(item) if isinstance(item, str) else item for item in value]
    return value
+ " - The keyword list should be short (1-2 keywords). Avoid unnecessary, unmentioned, indirectly inferred, or redundant keywords." + "2. **Create a Tree Structure for Risk Spillovers and Sub-Scenarios**:" + " - Decompose the Risk Scenario into **distinct, focused, and self-contained risk spillovers**." + " - Each risk spillover must represent a **specific risk channel** through which entities are exposed to as a consequence of the Risk Scenario." + " - Label each **primary node** in the tree explicitly as a \"Risk\" in the `Label` field. For example:" + " - Use 'Cost Risk' instead of 'Cost Impacts'." + " - Use 'Supply Chain Risk' instead of 'Supply Chain Disruptions'." + " - Risk spillovers must:" + " - Cover a wide range of potential impacts on entities' and long-term stabiliity and growth." + " - Explore both macroeconomic and microeconomic dimensions of the Risk Scenario '**{main_theme}**' and analyze their impact on entities when relevant." + " - Include **direct and indirect consequences** of the main scenario." + " - Represent **dimensions of risk** that entities must monitor or mitigate." + " - NOT overlap." + " - Independently identify the most relevant spillovers based on the Risk Scenario '**{main_theme}**', without limiting to predefined categories." + "3. **Generate Sub-Scenarios for Each Risk Spillover**:" + " - For each risk spillover, identify **specific sub-scenarios** that will arise as a consequence of the Risk Scenario '**{main_theme}**'." + " - All sub-scenarios must:" + " - Be **concise and descriptive sentences**, clearly stating how the sub-scenario is an event caused by the main scenario." + " - **Explicitly include ALL core concepts and keywords** from the main scenario, including specific geographical locations or temporal details, in every sentence in order to ensure clarity and relevance towards the main scenario." + " - Integrate the Risk Scenario in a natural way, avoiding repetitive or mechanical structures." 
+ " - Not exceed 15 words." + " - Sub-scenarios MUST be mutually exclusive: they CANNOT overlap neither within nor across branches of the tree." + " - Do NOT combine multiple sub-scenarios in a single label." + " - Sub-Scenarios have to be consistent with the parent Risk Spillover (e.g. Market Access related sub-scenarios have to belong to the Market Access Risk node)." + " - Generate 3 OR MORE sub-scenarios for each risk spillover." + " - Generate a short label for each subscenario." + "4. **Iterate Based on the Analyst's Focus: '{analyst_focus}'**:" + " - After generating the initial tree structure, use the analyst's focus ('{analyst_focus}') to:" + " - Identify **missing branches** or underexplored areas of the tree." + " - Add new risk spillovers or sub-scenarios that align with the analyst's focus." + " - Ensure that sub-scenarios ALWAYS include ALL core components of the Risk Scenario and are formulated as natural sentences." + " - Ensure that sub-scenarios DO NOT overlap within and across risk spillovers." + " - Ensure that sub-scenarios belong to the correct Risk Spillover." + " - If the analyst focus is empty, skip this step." + " - If you don't understand the analyst focus ('{analyst_focus}'), ask an open-ended question to the analyst." + "5. **Review and Expand the Tree for Missing Risks**:" + " - After incorporating the analyst's focus, review the tree structure to ensure it includes a **broad range of risks** and sub-scenarios." + " - Add any missing risks or sub-scenarios to the tree." + ), + "enforce_structure_string": ( + """IMPORTANT: Your response MUST be a valid JSON object. 
Each node in the JSON object must include:\n" + " - `node`: an integer representing the unique identifier for the node.\n" + " - `label`: a string for the name of the sub-theme.\n" + " - `summary`: a string to explain briefly in maximum 15 words why the sub-theme is related to the main theme or risk.\n" + " - `children`: an array of child nodes.\n" + "Format the JSON object as a nested dictionary. Be careful when specifying keys and items.\n" + "Avoid overlapping labels. Break down joint concepts into unique parents so that each parent represents ONLY ONE concept. AVOID creating branch names such as 'Compliance and Regulatory Risk'. Keep risks separate and create a single branch for each risk, such as 'Compliance Risk' and 'Regulatory Risk', each with their own children.\n" + "Return ONLY the JSON object, with no extra text, explanation, or markdown.\n" + "You MUST use ONLY these field names: label, node, summary, children. Do NOT use underscores, spaces, or any other characters in field names. 
If you use any other field names, your answer will be rejected.\n" + "## Example Structure:\n" + "**Theme: Global Warming**\n\n" + "{\n" + " \"node\": 1,\n" + " \"label\": \"Global Warming\",\n" + " \"summary\": \"Global Warming is a serious risk\",\n" + " \"children\": [\n" + " {\"node\": 2, \"label\": \"Renewable Energy Adoption\", \"summary\": \"Renewable energy reduces greenhouse gas emissions and thereby global warming and climate change effects\", \"children\": [\n" + " {\"node\": 5, \"label\": \"Solar Energy\", \"summary\": \"Solar energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 6, \"label\": \"Wind Energy\", \"summary\": \"Wind energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 7, \"label\": \"Hydropower\", \"summary\": \"Hydropower reduces greenhouse gas emissions\"}\n" + " ]},\n" + " {\"node\": 3, \"label\": \"Carbon Emission Reduction\", \"summary\": \"Carbon emission reduction decreases greenhouse gases\", \"children\": [\n" + " {\"node\": 8, \"label\": \"Carbon Capture Technology\", \"summary\": \"Carbon capture technology reduces atmospheric CO2\"},\n" + " {\"node\": 9, \"label\": \"Emission Trading Systems\", \"summary\": \"Emission trading systems incentivize reductions in greenhouse gases\"}\n" + " ]}\n" + " ]\n" + "}\n" + """ + ), + }, + "theme_entity": { + "qualifier": "Main Theme", + "user_prompt_message": "Your given Theme is: {main_theme}", + "default_instructions": ( + "Forget all previous prompts." + "You are assisting a professional analyst tasked with creating a screener to measure the impact of the theme {main_theme} on other entities, such countries, commodities, geographical places, and organizations." + "Your objective is to generate a comprehensive tree structure of distinct sub-themes that will guide the analyst's research process." + "Follow these steps strictly:" + "1. **Understand the Core Theme {main_theme}**:" + " - The theme {main_theme} is a central concept. 
All components are essential for a thorough understanding." + "2. **Create a Taxonomy of Sub-themes for {main_theme}**:" + " - Decompose the main theme {main_theme} into concise, focused, and self-contained sub-themes." + " - Each sub-theme should represent a singular, concise, informative, and clear aspect of the main theme." + " - Expand the sub-theme to be relevant for the {main_theme}: a single word is not informative enough." + " - Prioritize clarity and specificity in your sub-themes." + " - Avoid repetition and strive for diverse angles of exploration." + " - Provide a comprehensive list of potential sub-themes." + "3. **Iterate Based on the Analyst's Focus {analyst_focus}**:" + " - If no specific {analyst_focus} is provided, transition directly to formatting the JSON response." + "4. **Format Your Response as a JSON Object**:" + " - Each node in the JSON object must include:" + " - `node`: an integer representing the unique identifier for the node." + " - `label`: a string for the name of the sub-theme." + " - `summary`: a string to explain briefly in maximum 15 words why the sub-theme is related to the theme {main_theme}." + " - For the node referring to the first node {main_theme}, just define briefly in maximum 15 words the theme {main_theme}." + " - `children`: an array of child nodes." + ), + "enforce_structure_string": ( + """IMPORTANT: Your response MUST be a valid JSON object. Each node in the JSON object must include:\n" + "- `node`: an integer representing the unique identifier for the node.\n" + "- `label`: a string for the name of the sub-theme.\n" + "- `summary`: a string to explain briefly in maximum 15 words why the sub-theme is related to the theme.\n" + "- For the node referring to the main theme, just define briefly in maximum 15 words the theme.\n" + "- `children`: an array of child nodes.\n" + "Format the JSON object as a nested dictionary. Be careful when specifying keys and items.\n" + "Avoid overlapping labels. 
Break down joint concepts into unique parents so that each parent represents ONLY ONE concept. AVOID creating branch names such as 'Compliance and Regulatory Risk'. Keep risks separate and create a single branch for each risk, such as 'Compliance Risk' and 'Regulatory Risk', each with their own children.\n" + "Return ONLY the JSON object, with no extra text, explanation, or markdown.\n" + "You MUST use ONLY these field names: label, node, summary, children. Do NOT use underscores, spaces, or any other characters in field names. If you use any other field names, your answer will be rejected.\n" + "## Example Structure:\n" + "**Theme: Global Warming**\n\n" + "{\n" + " \"node\": 1,\n" + " \"label\": \"Global Warming\",\n" + " \"summary\": \"Global Warming is a serious risk\",\n" + " \"children\": [\n" + " {\"node\": 2, \"label\": \"Renewable Energy Adoption\", \"summary\": \"Renewable energy reduces greenhouse gas emissions and thereby global warming and climate change effects\", \"children\": [\n" + " {\"node\": 5, \"label\": \"Solar Energy\", \"summary\": \"Solar energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 6, \"label\": \"Wind Energy\", \"summary\": \"Wind energy reduces greenhouse gas emissions\"},\n" + " {\"node\": 7, \"label\": \"Hydropower\", \"summary\": \"Hydropower reduces greenhouse gas emissions\"}\n" + " ]},\n" + " {\"node\": 3, \"label\": \"Carbon Emission Reduction\", \"summary\": \"Carbon emission reduction decreases greenhouse gases\", \"children\": [\n" + " {\"node\": 8, \"label\": \"Carbon Capture Technology\", \"summary\": \"Carbon capture technology reduces atmospheric CO2\"},\n" + " {\"node\": 9, \"label\": \"Emission Trading Systems\", \"summary\": \"Emission trading systems incentivize reductions in greenhouse gases\"}\n" + " ]}\n" + " ]\n" + "}\n" + """ + ), + }, } diff --git a/src/bigdata_research_tools/prompts/labeler.py b/src/bigdata_research_tools/prompts/labeler.py index f7ae4f0..a3181b3 100644 --- 
a/src/bigdata_research_tools/prompts/labeler.py +++ b/src/bigdata_research_tools/prompts/labeler.py @@ -314,3 +314,240 @@ def get_risk_system_prompt(main_theme: str, label_summaries: list[str]) -> str: return risk_system_prompt_template.format( main_theme=main_theme, label_summaries=label_summaries ) + +def get_entity_risk_system_prompt(main_theme: str, label_summaries: list) -> str: + """ + Generate the risk entity labeler prompt with the provided parameters. + + Args: + main_theme (str): The main risk theme being analyzed + label_summaries (list): List of risk sub-scenario summaries + + Returns: + str: The formatted prompt string + """ + return entity_risk_system_prompt_template.format( + main_theme=main_theme, label_summaries=label_summaries, BIGDATA_TARGET_ENTITY_PLACEHOLDER = get_target_entity_placeholder(), BIGDATA_OTHER_ENTITY_PLACEHOLDER = get_other_entity_placeholder() + ) + +entity_risk_system_prompt_template: str = """Forget all previous prompts. + +You are assisting a professional analyst in evaluating both the exposure and risk classification for "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" regarding the Risk Scenario "{main_theme}". +This involves a two-step process: confirming exposure of "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" and classifying specific risks if exposure is confirmed. Use the headline for contextual understanding. + + +You will receive the following information:: +- ID: [text ID] +- Headline: [The Headline of the News Article containing Text] +- Text: [Paragraph requiring analysis] +- Risk Scenario: "{main_theme}" + + +Follow these guidelines: + + +- Examine whether the text explicitly mentions the Risk Scenario "{main_theme}" or any of its core components. +- Ensure that "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is the main focus of the text and that it is clearly stated that "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is facing or will face consequences caused by the Risk Scenario "{main_theme}". 
+- Assess if there are DIRECT consequences on "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}’s" internal and external activities, operations, future performance, stability, sustainability, and long term growth. +- Designate the exposure as unclear if the text lacks an explicit DIRECT link between "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" and the Risk Scenario +- Designate the exposure as unclear if the text relies on generic information. + + + +If direct exposure of {BIGDATA_TARGET_ENTITY_PLACEHOLDER} is confirmed: + +- Identify and classify the specific risk using this list of Risk Sub-Scenarios: + "{label_summaries}". + +- Follow a detailed classification process: + - Examine the text to confirm how the Risk Scenario "{main_theme}" directly impacts "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" through one of the Risk Sub-Scenarios. + - Write a concise motivation that explains the direct link between "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" and the Risk Sub-Scenario as stated in the text. + - The motivation should always start with "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}". + - Identify an appropriate Risk Sub-Scenario label from the list that describes explicitly the impact on {BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s internal and external activities, operations, stability, sustainability, and long term growth or performance. + - Be specific in the risk classification, ensure that the risk sub-scenario represents well your motivation statement. + - Ensure that the Risk Sub-Scenario label can be directly extracted from the text that it describes with high granularity how "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is affected. + - Avoid deriving conclusions based on unstated or inferred information. Focus only on the explicit content of the text or headline. + + + +- Extract verbatim quotes from the text that support the classification and illustrate {BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s exposure to the specific Risk Sub-Scenario. 
+- Ensure quotes directly relate to the impact described and justify the risk label. +- Extract full sentences or phrases that clearly indicate, as standalone statements, how "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is affected by the Risk Scenario "{main_theme}" and the Sub-Scenario label assigned. + + + +- If the text does explicitly link "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" with the Risk Scenario "{main_theme}", classify the exposure with a sentiment label speficied as follows: + - "negative" if the text indicates that "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is facing or will face negative consequences due to the Risk Scenario "{main_theme}". + - "positive" if the text indicates that "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is well positioned in the face of the Risk Scenario "{main_theme}", or is in a better position with respect to the past, intended as previous occurrences of the Risk Scenario from which the situation has improved, or if the text indicates that it is doing better than its peers, or than the {BIGDATA_OTHER_ENTITY_PLACEHOLDER}. + - "neutral" if the text indicates that "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" is neither positively nor negatively affected by the Risk Scenario "{main_theme}". +- If the exposure is unclear, assign the sentiment label as "neutral". + + + +Structure your response as a JSON object with the sentence ID as the key containing: +"motivation": A concise explanation describing the link between "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" and the Risk Sub-Scenario. +"label": State the specific risk Sub-Scenario label or 'unclear'. +"quotes": Present verbatim quotes that justify exposure and risk label assignment. +"sentiment": State the sentiment label as 'negative', 'positive', or 'neutral'. + +Format: {{"": +{{"motivation": "", "label": "", + "quotes": "", "sentiment": ""}} +}}. 
+ + + +ID: 3 +Headline: "Tariffs to Strain Supply Chains Globally" +Text: "New tariffs against China will significantly impact {BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s operations due to its reliance on raw materials from Chinese suppliers." +Scenario: "New Tariffs against China" +Output: + +{{3:{{ + "motivation": "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s supply operations are directly impacted by new tariffs due to their reliance on raw materials sourced from China.", + "label": "Supply Chain Disruption", + "quotes": ["New tariffs against China will significantly impact {BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s operations", "reliance on raw materials from Chinese suppliers"], + "sentiment": "negative"}} +}} + +ID: 5 +Headline: "Interest Rate Fluctuations to Affect Markets" +Text: "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s analysts are forecasting higher risks associated with potential interest rate changes." +Scenario: "Interest Rate Volatility" +Output: + +{{5:{{ + "motivation": "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s analysts are forecasting higher risks associated with potential interest rate changes.", + "label": "unclear", + "quotes": [], + "sentiment": "neutral"}} +}} + +ID: 2 +Headline: "Economic Challenges Ahead Due to Tariffs on China" +Text: "{BIGDATA_OTHER_ENTITY_PLACEHOLDER}'s analysts report a potential economic downturn in {BIGDATA_TARGET_ENTITY_PLACEHOLDER} linked to new tariffs against China." 
+Risk Scenario: "New Tariffs Against China" +Output: + +{{2:{{ + "motivation": "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}'s analysts are assessing the potential economic impact of new tariffs against China.", + "label": "Economic Downturns", + "quotes": ["{BIGDATA_OTHER_ENTITY_PLACEHOLDER}'s analysts report a potential economic downturn in {BIGDATA_TARGET_ENTITY_PLACEHOLDER}"], + "sentiment": "negative"}} +}} + +ID: 3 +Headline: "Analyzing External Factors in Business Strategy" +Text: "{BIGDATA_OTHER_ENTITY_PLACEHOLDER} is studying external factors such as tariffs to gauge potential risks. {BIGDATA_OTHER_ENTITY_PLACEHOLDER}'s analysts report a potential economic downturn in {BIGDATA_TARGET_ENTITY_PLACEHOLDER}." +Risk Scenario: "New Tariffs on Semiconductors" +Output: + +{{3:{{ + "motivation": "{BIGDATA_OTHER_ENTITY_PLACEHOLDER}'s analysis of external factors does not establish a direct link to {BIGDATA_TARGET_ENTITY_PLACEHOLDER}.", + "label": "unclear", + "quotes": [], + "sentiment": "neutral"}} +}} + +ID: 4 +Headline: "Market Trends Influence Stock Performance" +Text: "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}’s stock is influenced by broad market trends." +Risk Scenario: "Increased Uncertainty and Volatility" +Output: + +{{4:{{ + "motivation": "The text does not related to the Risk Scenario and it does not mention any specific risk sub-scenario affecting {BIGDATA_TARGET_ENTITY_PLACEHOLDER}.", + "label": "unclear", + "quotes": [], + "sentiment": "neutral"}} +}} + +ID: 5 + +Headline: "Tariffs and Their Economic Impact" +Text: "{BIGDATA_OTHER_ENTITY_PLACEHOLDER} researchers estimate that tariffs will affect the broader economy in {BIGDATA_TARGET_ENTITY_PLACEHOLDER}." 
+Risk Scenario: "New Tariffs against China" +Output: + +{{5:{{ + "motivation": "{BIGDATA_TARGET_ENTITY_PLACEHOLDER} is not linked with any specific risk sub-scenario or any tangible effect of the Risk Scenario.", + "label": "unclear", + "quotes": [], + "sentiment": "neutral"}} +}} + +ID: 2 +Headline: "China Tariffs Impact Supply Chains" +Text: "According to recent reports, {BIGDATA_TARGET_ENTITY_PLACEHOLDER} is heavily dependent on China. The recent tariffs against China have forced {BIGDATA_TARGET_ENTITY_PLACEHOLDER} to reconsider its supply chain, potentially leading to increased logistics costs." +Risk Scenario: "New Tariffs against China" +Output: + +{{2:{{ + "motivation": "{BIGDATA_TARGET_ENTITY_PLACEHOLDER} is said to be reconsidering its supply chain in the face of the risk scenario. The text clearly links {BIGDATA_TARGET_ENTITY_PLACEHOLDER} with the Risk Scenario and mentions an explicit Sub-scenario risk of Supply Chain Disruptions.", + "label": "Supply Chain Disruption", + "quotes": [ + "{BIGDATA_TARGET_ENTITY_PLACEHOLDER} is heavily dependent on China", + "The recent tariffs against China have forced {BIGDATA_TARGET_ENTITY_PLACEHOLDER} to reconsider its supply chain, potentially leading to increased logistics costs." + ], + "sentiment": "negative"}} +}} + +""" + +def get_entity_theme_system_prompt(main_theme: str, label_summaries: list) -> str: + """ + Generate the entity screener labeler prompt with the provided parameters. + + Args: + main_theme (str): The main theme being analyzed + label_summaries (list): List of risk theme summaries + + Returns: + str: The formatted prompt string + """ + return entity_theme_system_prompt_template.format( + main_theme=main_theme, label_summaries=label_summaries, BIGDATA_TARGET_ENTITY_PLACEHOLDER = get_target_entity_placeholder() + ) + +entity_theme_system_prompt_template: str = """ + Forget all previous prompts. 
+ You are assisting a professional analyst in evaluating the impact of the theme '{main_theme}' on an entity "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}". + Your primary task is first, to ensure that each sentence is explicitly related to '{main_theme}', and second, to accurately associate each given sentence with + the relevant label contained within the list '{label_summaries}'. + + Please adhere strictly to the following guidelines: + + 1. **Analyze the Sentence**: + - Each input consists of a sentence ID, an entity name ('{BIGDATA_TARGET_ENTITY_PLACEHOLDER}'), and the sentence text. + - Analyze the sentence to understand if the content clearly establishes a connection to '{main_theme}'. + - Your primary goal is to label as '{unknown_label}' the sentences that don't explicitly mention '{main_theme}'. + - Analyze the list of labels '{label_summaries}' used for label assignment. '{label_summaries}' is a Python list variable containing distinct labels and their definition in format 'Label: Summary', you must pick label only from 'Label' part which means left side of the semicolon for each Label:Summary pair. + - Your secondary goal is to select the most appropriate label from '{label_summaries}' that corresponds to the content of the sentence. + + 2. **First Label Assignment**: + - Assign the label '{unknown_label}' to the sentence related to "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" when it does not explicitly mentions '{main_theme}'. Otherwise, don't assign a label. + - Evaluate each sentence independently, focusing solely on the context provided within that specific sentence. + - Use only the information contained within the sentence for your label assignment. + - When evaluating the sentence, "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" must clearly mention that the entity is clearly impacted by '{main_theme}'. + - Many sentences are only tangentially connected to the topic '{main_theme}'. These sentences must be assigned the label '{unknown_label}'. + + 3. 
**Second Label Assignment**: + - For the sentences not labeled as '{unknown_label}' and only for them, assign a unique label from the list '{label_summaries}' to the sentence related to "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}". + - Evaluate each sentence independently, focusing solely on the context provided within that specific sentence. + - Use only the information contained within the sentence for your label assignment. + - Ensure that the sentence clearly establishes a connection to the label you assigned and to the theme '{main_theme}'. + - You must not create a new label or choose a label that is not present in '{label_summaries}'. + - If the sentence does not explicitly mention the label, assign the label '{unknown_label}'. + - When evaluating the sentence, "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}" must clearly mention that the entity is impacted by the label assigned and '{main_theme}'. + + 4. **Response Format**: + - Your output should be structured as a JSON object that includes: + 1. A brief motivation for your choice. + 2. The assigned label. + - Each entry must start with the sentence ID and contain a clear motivation that begins with "{BIGDATA_TARGET_ENTITY_PLACEHOLDER}". + - The motivation should explain why the label was selected from '{label_summaries}' based on the information in the sentence and in the context of '{main_theme}'. It should also justify the label that had been assigned. + - Ensure that the exact context is understood and labels are based only on explicitly mentioned information in the sentence. Otherwise, assign the label '{unknown_label}'. + - The assigned label should be only the string that precedes the character ':'. + - Format your JSON as follows: {{"": {{"motivation": "", "label": "