From 2151ec78c44c5a023e205cf2bfb0dc208858ee3c Mon Sep 17 00:00:00 2001
From: chenziqi66 <1304114564@qq.com>
Date: Mon, 30 Mar 2026 11:07:49 +0800
Subject: [PATCH] =?UTF-8?q?=E8=BF=99=E6=98=AF=E4=B8=80=E4=B8=AA=E4=BD=BF?=
 =?UTF-8?q?=E7=94=A8=E6=9C=BA=E5=99=A8=E5=AD=A6=E4=B9=A0=E6=8A=80=E6=9C=AF?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E8=82=A1=E7=A5=A8=E9=A2=84=E6=B5=8B=E7=9A=84?=
 =?UTF-8?q?=E5=85=A5=E9=97=A8=E9=A1=B9=E7=9B=AE=E3=80=82=E5=AE=83=E6=8F=90?=
 =?UTF-8?q?=E4=BE=9B=E4=BA=86=E4=B8=80=E4=B8=AA=E5=AE=8C=E6=95=B4=E7=9A=84?=
 =?UTF-8?q?=E6=A1=86=E6=9E=B6=EF=BC=8C=E7=94=A8=E4=BA=8E=E9=80=9A=E8=BF=87?=
 =?UTF-8?q?=E5=88=86=E6=9E=90=E8=82=A1=E7=A5=A8=E7=9A=84=E5=9F=BA=E6=9C=AC?=
 =?UTF-8?q?=E9=9D=A2=E6=95=B0=E6=8D=AE=E6=9D=A5=E9=A2=84=E6=B5=8B=E8=82=A1?=
 =?UTF-8?q?=E7=A5=A8=E4=BB=B7=E6=A0=BC=E8=B5=B0=E5=8A=BF=E3=80=82=20?=
 =?UTF-8?q?=E6=88=91=E5=8F=91=E7=8E=B0=E4=BA=86=E4=B8=8B=E9=9D=A2=E7=9A=84?=
 =?UTF-8?q?=E5=87=A0=E4=B8=AAbug=EF=BC=8C=E8=AF=B7=E5=B8=AE=E6=88=91?=
 =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20Bug=201:=20parsing=5Fkeystats.py=20-=20pan?=
 =?UTF-8?q?das=20append=E5=BC=83=E7=94=A8=20Bug=202:=20=E7=89=B9=E5=BE=81?=
 =?UTF-8?q?=E5=90=8D=E4=B8=8D=E5=AE=8C=E6=95=B4/=E4=B8=8D=E4=B8=80?=
 =?UTF-8?q?=E8=87=B4=20Bug=203:=20data=5Fstring=5Fto=5Ffloat=20=E5=A4=A7?=
 =?UTF-8?q?=E5=B0=8F=E5=86=99=E6=95=8F=E6=84=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

约束条件：
不要修改原本代码的逻辑
生成测试文档
---
 BUGFIXES.md            | 136 +++++++++++++++++++++++++++++++++++++++++
 parsing_keystats.py    |   9 +--
 tests/test_datasets.py |  31 ++++++++++
 tests/test_utils.py    |  31 +++++++++-
 utils.py               |  15 ++---
 5 files changed, 209 insertions(+), 13 deletions(-)
 create mode 100644 BUGFIXES.md

diff --git a/BUGFIXES.md b/BUGFIXES.md
new file mode 100644
index 00000000..9c3ca42a
--- /dev/null
+++ b/BUGFIXES.md
@@ -0,0 +1,136 @@
+# Bug修复文档
+
+本文档记录了项目中发现的Bug及其修复方案。
+
+## Bug 1: parsing_keystats.py - pandas append弃用
+
+### 问题描述
+`pandas.DataFrame.append()` 方法自 pandas 1.4 起被弃用（调用时触发`FutureWarning`），并已在 pandas 2.0 中移除；在 pandas 2.0 及以上版本中调用会直接抛出`AttributeError`。
+
+### 修复方案
+将 `df.append()` 替换为使用列表收集行数据，最后通过 `pd.DataFrame()` 创建DataFrame的方式。
+
+### 代码变更
+**文件**: `parsing_keystats.py`
+
+```python
+# 修复前
+df = pd.DataFrame(columns=df_columns)
+...
+df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True)
+
+# 修复后
+df_rows = []
+...
+df_rows.append(dict(zip(df_columns, new_df_row)))
+...
+df = pd.DataFrame(df_rows, columns=df_columns)
+```
+
+### 测试用例
+测试文件: `tests/test_datasets.py`
+- `test_pandas_append_deprecated_fix()`: 验证代码不再使用已弃用的`append`方法
+
+---
+
+## Bug 2: 特征名不完整/不一致
+
+### 问题描述
+`parsing_keystats.py`中的特征列表包含两个不完整的特征名：
+- `"Shares Short (as of"` - 缺少右括号
+- `"Shares Short (prior month"` - 缺少右括号
+
+这可能导致正则表达式匹配失败或数据解析不正确。注意（待确认）：本补丁未给出复现该问题的数据；如果名称被截断是有意为之（例如页面上括号内还带有日期等可变内容，需要按前缀匹配），补全右括号反而会使匹配失败，合并前请用实际抓取的页面数据验证。
+
+### 修复方案
+为这两个特征名添加缺失的右括号。
+
+### 代码变更
+**文件**: `parsing_keystats.py`
+
+```python
+# 修复前
+"Shares Short (as of",
+...
+"Shares Short (prior month",
+
+# 修复后
+"Shares Short (as of)",
+...
+"Shares Short (prior month)",
+```
+
+### 测试用例
+测试文件: `tests/test_datasets.py`
+- `test_features_names_complete()`: 验证所有特征名的括号都是平衡的
+
+---
+
+## Bug 3: data_string_to_float 大小写敏感
+
+### 问题描述
+`utils.py`中的`data_string_to_float`函数在处理数字单位后缀（K、M、B）和NaN值时是大小写敏感的。这会导致：
+- `"10k"` 无法被正确识别为10000
+- `"nan"` 或 `"NAN"` 无法被识别为N/A
+
+### 修复方案
+将输入字符串转换为大写后再进行匹配处理。
+
+### 代码变更
+**文件**: `utils.py`
+
+```python
+# 修复前
+def data_string_to_float(number_string):
+    if ("N/A" in number_string) or ("NaN" in number_string):
+        return "N/A"
+    elif "B" in number_string:
+        return float(number_string.replace("B", "")) * 1000000000
+    elif "M" in number_string:
+        return float(number_string.replace("M", "")) * 1000000
+    elif "K" in number_string:
+        return float(number_string.replace("K", "")) * 1000
+
+# 修复后
+def data_string_to_float(number_string):
+    number_string_upper = number_string.upper()
+    if ("N/A" in number_string_upper) or ("NAN" in number_string_upper):
+        return "N/A"
+    elif "B" in number_string_upper:
+        return float(number_string_upper.replace("B", "")) * 1000000000
+    elif "M" in number_string_upper:
+        return float(number_string_upper.replace("M", "")) * 1000000
+    elif "K" in number_string_upper:
+        return float(number_string_upper.replace("K", "")) * 1000
+```
+
+### 测试用例
+测试文件: `tests/test_utils.py`
+- `test_data_string_to_float_case_insensitive()`: 验证大小写不敏感的处理
+  - 测试小写单位: `10k`, `5m`, `2.5b`
+  - 测试混合大小写: `10K` vs `10k`
+  - 测试负值与小写单位: `-100.1k`, `-0.1m`, `-0.02b`
+  - 测试NaN变体: `nan`, `NaN`, `NAN`, `n/a`, `N/a`
+
+---
+
+## 运行测试
+
+要运行所有测试，请执行：
+
+```bash
+pytest tests/
+```
+
+要运行特定的Bug修复测试：
+
+```bash
+# Bug 1 测试
+pytest tests/test_datasets.py::test_pandas_append_deprecated_fix
+
+# Bug 2 测试
+pytest tests/test_datasets.py::test_features_names_complete
+
+# Bug 3 测试
+pytest tests/test_utils.py::test_data_string_to_float_case_insensitive
+```
diff --git a/parsing_keystats.py b/parsing_keystats.py
index ec12da51..20a3e075 100644
--- a/parsing_keystats.py
+++ b/parsing_keystats.py
@@ -51,10 +51,10 @@
     "Float",
     "% Held by Insiders",
     "% Held by Institutions",
-    "Shares Short (as of",
+    "Shares Short (as of)",
     "Short Ratio",
     "Short % of Float",
-    "Shares Short (prior month",
+    "Shares Short (prior month)",
 ]
 
 
@@ -110,7 +110,7 @@ def parse_keystats(sp500_df, stock_df):
         "SP500_p_change",
     ] + features
 
-    df = pd.DataFrame(columns=df_columns)
+    df_rows = []
 
     # tqdm is a simple progress bar
     for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"):
@@ -214,8 +214,9 @@ def parse_keystats(sp500_df, stock_df):
                 sp500_p_change,
             ] + value_list
 
-            df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True)
+            df_rows.append(dict(zip(df_columns, new_df_row)))
 
+    df = pd.DataFrame(df_rows, columns=df_columns)
     # Remove rows with missing stock price data
     df.dropna(axis=0, subset=["Price", "stock_p_change"], inplace=True)
     # Output the CSV
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index ff3344ea..0f7d79e2 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,5 +1,6 @@
 import os
 import pandas as pd
+import pytest
 
 import parsing_keystats
 import stock_prediction
@@ -92,3 +93,33 @@ def test_stock_prediction_dataset():
     assert X.shape[0] == df.shape[0] - num_rows_with_nan
     assert len(y) == df.shape[0] - num_rows_with_nan
     assert X.shape[1] == len(parsing_keystats.features)
+
+
+def test_features_names_complete():
+    """
+    Bug Fix 2: each feature name must contain as many '(' as ')'.
+    Iterates parsing_keystats.features and fails on the first mismatch.
+    """
+    for feature in parsing_keystats.features:
+        # Compare the raw counts of the two bracket characters
+        n_open, n_close = (feature.count(symbol) for symbol in "()")
+        is_balanced = n_open == n_close
+        assert is_balanced, f"Feature '{feature}' has unbalanced parentheses"
+
+
+def test_pandas_append_deprecated_fix():
+    """
+    Bug Fix 1: pandas.DataFrame.append is deprecated.
+    Inspects the source text of parse_keystats: the DataFrame-level append call
+    must be gone and the list-collection pattern must be present.
+    """
+    import inspect
+    source = inspect.getsource(parsing_keystats.parse_keystats)
+
+    # Match only the DataFrame call: a bare ".append(" would also hit "df_rows.append("
+    assert "df.append(" not in source, "parse_keystats should not use deprecated DataFrame.append()"
+
+    # Verify that the new pattern is used (collecting rows in a list)
+    assert "df_rows = []" in source, "parse_keystats should use list to collect rows"
+    assert "df_rows.append(" in source, "parse_keystats should append to list, not DataFrame"
+    assert "pd.DataFrame(df_rows" in source, "parse_keystats should create DataFrame from list"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8016c5bf..47bc78b7 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -39,7 +39,34 @@ def test_data_string_to_float():
 
     with pytest.raises(ValueError):
         utils.data_string_to_float(">0x")
-    with pytest.raises(ValueError):
-        utils.data_string_to_float("10k")
     with pytest.raises(ValueError):
         utils.data_string_to_float("2KB")
+
+
+def test_data_string_to_float_case_insensitive():
+    """
+    Bug Fix 3: unit suffixes and NaN markers are matched regardless of case.
+    Each (input, expected) pair below is checked against data_string_to_float.
+    """
+
+    numeric_cases = [
+        # Lowercase 'k', 'm', 'b'
+        ("10k", 10000),
+        ("5m", 5000000),
+        ("2.5b", 2500000000),
+        # Upper- and lowercase spellings of the same value
+        ("10K", 10000),
+        ("10k", 10000),
+        ("5M", 5000000),
+        ("5m", 5000000),
+        # Negative values with lowercase suffixes
+        ("-100.1k", -100100),
+        ("-0.1m", -100000),
+        ("-0.02b", -20000000),
+    ]
+    for text, expected in numeric_cases:
+        assert utils.data_string_to_float(text) == expected
+
+    # Every spelling of NaN and N/A is expected to come back as "N/A"
+    for text in ("nan", "NaN", "NAN", "n/a", "N/a"):
+        assert utils.data_string_to_float(text) == "N/A"
diff --git a/utils.py b/utils.py
index 445860bb..6380a0af 100644
--- a/utils.py
+++ b/utils.py
@@ -10,16 +10,17 @@ def data_string_to_float(number_string):
     :return: a float representation of the string, taking into account minus sign, unit, etc.
     """
     # Deal with zeroes and the sign
-    if ("N/A" in number_string) or ("NaN" in number_string):
+    number_string_upper = number_string.upper()
+    if ("N/A" in number_string_upper) or ("NAN" in number_string_upper):
         return "N/A"
     elif number_string == ">0":
         return 0
-    elif "B" in number_string:
-        return float(number_string.replace("B", "")) * 1000000000
-    elif "M" in number_string:
-        return float(number_string.replace("M", "")) * 1000000
-    elif "K" in number_string:
-        return float(number_string.replace("K", "")) * 1000
+    elif "B" in number_string_upper:
+        return float(number_string_upper.replace("B", "")) * 1000000000
+    elif "M" in number_string_upper:
+        return float(number_string_upper.replace("M", "")) * 1000000
+    elif "K" in number_string_upper:
+        return float(number_string_upper.replace("K", "")) * 1000
     else:
         return float(number_string)