robertmartin8 · chenziqi66 · Mar 30, 2026
diff --git a/BUGFIXES.md b/BUGFIXES.md
@@ -0,0 +1,136 @@
+# Bug修复文档
+
+本文档记录了项目中发现的Bug及其修复方案。
+
+## Bug 1: parsing_keystats.py - pandas append弃用
+
+### 问题描述
+`pandas.DataFrame.append()` 方法自 pandas 1.4.0 起已被弃用（调用时会触发 `FutureWarning`），并已在 pandas 2.0 中被移除；在 pandas 2.0 及以上版本中调用该方法会直接抛出 `AttributeError`。
+
+### 修复方案
+将 `df.append()` 替换为使用列表收集行数据，最后通过 `pd.DataFrame()` 创建DataFrame的方式。
+
+### 代码变更
+**文件**: `parsing_keystats.py`
+
+```python
+# 修复前
+df = pd.DataFrame(columns=df_columns)
+...
+df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True)
+
+# 修复后
+df_rows = []
+...
+df_rows.append(dict(zip(df_columns, new_df_row)))
+...
+df = pd.DataFrame(df_rows, columns=df_columns)
+```
+
+### 测试用例
+测试文件: `tests/test_datasets.py`
+- `test_pandas_append_deprecated_fix()`: 验证代码不再使用已弃用的`append`方法
+
+---
+
+## Bug 2: 特征名不完整/不一致
+
+### 问题描述
+`parsing_keystats.py`中的特征列表包含两个不完整的特征名：
+- `"Shares Short (as of"` - 缺少右括号
+- `"Shares Short (prior month"` - 缺少右括号
+
+这会导致正则表达式匹配失败或数据解析不正确。
+
+### 修复方案
+为这两个特征名添加缺失的右括号。
+
+### 代码变更
+**文件**: `parsing_keystats.py`
+
+```python
+# 修复前
+"Shares Short (as of",
+...
+"Shares Short (prior month",
+
+# 修复后
+"Shares Short (as of)",
+...
+"Shares Short (prior month)",
+```
+
+### 测试用例
+测试文件: `tests/test_datasets.py`
+- `test_features_names_complete()`: 验证每个特征名中左括号与右括号的数量相等
+
+---
+
+## Bug 3: data_string_to_float 大小写敏感
+
+### 问题描述
+`utils.py`中的`data_string_to_float`函数在处理数字单位后缀（K、M、B）和NaN值时是大小写敏感的。这会导致：
+- `"10k"` 无法被正确识别为10000
+- `"nan"` 或 `"NAN"` 无法被识别为N/A
+
+### 修复方案
+将输入字符串转换为大写后再进行匹配处理。
+
+### 代码变更
+**文件**: `utils.py`
+
+```python
+# 修复前
+def data_string_to_float(number_string):
+    if ("N/A" in number_string) or ("NaN" in number_string):
+        return "N/A"
+    elif "B" in number_string:
+        return float(number_string.replace("B", "")) * 1000000000
+    elif "M" in number_string:
+        return float(number_string.replace("M", "")) * 1000000
+    elif "K" in number_string:
+        return float(number_string.replace("K", "")) * 1000
+
+# 修复后
+def data_string_to_float(number_string):
+    number_string_upper = number_string.upper()
+    if ("N/A" in number_string_upper) or ("NAN" in number_string_upper):
+        return "N/A"
+    elif "B" in number_string_upper:
+        return float(number_string_upper.replace("B", "")) * 1000000000
+    elif "M" in number_string_upper:
+        return float(number_string_upper.replace("M", "")) * 1000000
+    elif "K" in number_string_upper:
+        return float(number_string_upper.replace("K", "")) * 1000
+```
+
+### 测试用例
+测试文件: `tests/test_utils.py`
+- `test_data_string_to_float_case_insensitive()`: 验证大小写不敏感的处理
+  - 测试小写单位: `10k`, `5m`, `2.5b`
+  - 测试大写与小写单位结果一致: `10K` 与 `10k`、`5M` 与 `5m`
+  - 测试负值与小写单位: `-100.1k`, `-0.1m`, `-0.02b`
+  - 测试NaN变体: `nan`, `NaN`, `NAN`, `n/a`, `N/a`
+
+---
+
+## 运行测试
+
+要运行所有测试，请执行：
+
+```bash
+pytest tests/
+```
+
+要运行特定的Bug修复测试：
+
+```bash
+# Bug 1 测试
+pytest tests/test_datasets.py::test_pandas_append_deprecated_fix
+
+# Bug 2 测试
+pytest tests/test_datasets.py::test_features_names_complete
+
+# Bug 3 测试
+pytest tests/test_utils.py::test_data_string_to_float_case_insensitive
+```
diff --git a/parsing_keystats.py b/parsing_keystats.py
@@ -51,10 +51,10 @@
     "Float",
     "% Held by Insiders",
     "% Held by Institutions",
-    "Shares Short (as of",
+    "Shares Short (as of)",
     "Short Ratio",
     "Short % of Float",
-    "Shares Short (prior month",
+    "Shares Short (prior month)",
 ]
 
 
@@ -110,7 +110,7 @@ def parse_keystats(sp500_df, stock_df):
         "SP500_p_change",
     ] + features
 
-    df = pd.DataFrame(columns=df_columns)
+    df_rows = []
 
     # tqdm is a simple progress bar
     for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"):
@@ -214,8 +214,9 @@ def parse_keystats(sp500_df, stock_df):
                 sp500_p_change,
             ] + value_list
 
-            df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True)
+            df_rows.append(dict(zip(df_columns, new_df_row)))
 
+    df = pd.DataFrame(df_rows, columns=df_columns)
     # Remove rows with missing stock price data
     df.dropna(axis=0, subset=["Price", "stock_p_change"], inplace=True)
     # Output the CSV

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -1,5 +1,6 @@
 import os
 import pandas as pd
+import pytest
 
 import parsing_keystats
 import stock_prediction
@@ -92,3 +93,33 @@ def test_stock_prediction_dataset():
     assert X.shape[0] == df.shape[0] - num_rows_with_nan
     assert len(y) == df.shape[0] - num_rows_with_nan
     assert X.shape[1] == len(parsing_keystats.features)
+
+
+def test_features_names_complete():
+    """
+    Bug Fix 2: no feature name may be left with an unclosed parenthesis.
+    Passes when every name holds as many '(' as ')' characters.
+    """
+    for feature in parsing_keystats.features:
+        # Tally each bracket kind; a name cut off mid-parenthesis yields unequal tallies
+        n_left, n_right = (feature.count(ch) for ch in "()")
+        mismatch_msg = f"Feature '{feature}' has unbalanced parentheses"
+        assert n_left == n_right, mismatch_msg
+
+
+def test_pandas_append_deprecated_fix():
+    """
+    Bug Fix 1: pandas.DataFrame.append is deprecated.
+    This test inspects the source of parse_keystats to verify that rows are collected
+    in a plain list (df_rows) and the DataFrame is built once from that list.
+    """
+    import inspect
+    source = inspect.getsource(parsing_keystats.parse_keystats)
+
+    # Only DataFrame.append is forbidden; a bare ".append(" check would also match df_rows.append(
+    assert "df.append(" not in source, "parse_keystats should not use deprecated DataFrame.append()"
+
+    # Verify that the new pattern is used (collecting rows in a list)
+    assert "df_rows = []" in source, "parse_keystats should use list to collect rows"
+    assert "df_rows.append(" in source, "parse_keystats should append to list, not DataFrame"
+    assert "pd.DataFrame(df_rows" in source, "parse_keystats should create DataFrame from list"
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -39,7 +39,34 @@ def test_data_string_to_float():
 
     with pytest.raises(ValueError):
         utils.data_string_to_float(">0x")
-    with pytest.raises(ValueError):
-        utils.data_string_to_float("10k")
     with pytest.raises(ValueError):
         utils.data_string_to_float("2KB")
+
+
+def test_data_string_to_float_case_insensitive():
+    """
+    Bug Fix 3: K/M/B unit suffixes and NaN markers must be recognised in any letter case.
+    """
+    expectations = [
+        # Lowercase 'k', 'm', 'b'
+        ("10k", 10000),
+        ("5m", 5000000),
+        ("2.5b", 2500000000),
+        # Same suffix in upper and lower case
+        ("10K", 10000),
+        ("10k", 10000),
+        ("5M", 5000000),
+        ("5m", 5000000),
+        # Negative values with lowercase suffixes
+        ("-100.1k", -100100),
+        ("-0.1m", -100000),
+        ("-0.02b", -20000000),
+        # NaN / N/A spellings
+        ("nan", "N/A"),
+        ("NaN", "N/A"),
+        ("NAN", "N/A"),
+        ("n/a", "N/A"),
+        ("N/a", "N/A"),
+    ]
+    for raw, expected in expectations:
+        assert utils.data_string_to_float(raw) == expected
diff --git a/utils.py b/utils.py
@@ -10,16 +10,17 @@ def data_string_to_float(number_string):
     :return: a float representation of the string, taking into account minus sign, unit, etc.
     """
     # Deal with zeroes and the sign
-    if ("N/A" in number_string) or ("NaN" in number_string):
+    number_string_upper = number_string.upper()
+    if ("N/A" in number_string_upper) or ("NAN" in number_string_upper):
         return "N/A"
     elif number_string == ">0":
         return 0
-    elif "B" in number_string:
-        return float(number_string.replace("B", "")) * 1000000000
-    elif "M" in number_string:
-        return float(number_string.replace("M", "")) * 1000000
-    elif "K" in number_string:
-        return float(number_string.replace("K", "")) * 1000
+    elif "B" in number_string_upper:
+        return float(number_string_upper.replace("B", "")) * 1000000000
+    elif "M" in number_string_upper:
+        return float(number_string_upper.replace("M", "")) * 1000000
+    elif "K" in number_string_upper:
+        return float(number_string_upper.replace("K", "")) * 1000
     else:
         return float(number_string)