From 2151ec78c44c5a023e205cf2bfb0dc208858ee3c Mon Sep 17 00:00:00 2001 From: chenziqi66 <1304114564@qq.com> Date: Mon, 30 Mar 2026 11:07:49 +0800 Subject: [PATCH] =?UTF-8?q?=E8=BF=99=E6=98=AF=E4=B8=80=E4=B8=AA=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E6=9C=BA=E5=99=A8=E5=AD=A6=E4=B9=A0=E6=8A=80=E6=9C=AF?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E8=82=A1=E7=A5=A8=E9=A2=84=E6=B5=8B=E7=9A=84?= =?UTF-8?q?=E5=85=A5=E9=97=A8=E9=A1=B9=E7=9B=AE=E3=80=82=E5=AE=83=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E4=BA=86=E4=B8=80=E4=B8=AA=E5=AE=8C=E6=95=B4=E7=9A=84?= =?UTF-8?q?=E6=A1=86=E6=9E=B6=EF=BC=8C=E7=94=A8=E4=BA=8E=E9=80=9A=E8=BF=87?= =?UTF-8?q?=E5=88=86=E6=9E=90=E8=82=A1=E7=A5=A8=E7=9A=84=E5=9F=BA=E6=9C=AC?= =?UTF-8?q?=E9=9D=A2=E6=95=B0=E6=8D=AE=E6=9D=A5=E9=A2=84=E6=B5=8B=E8=82=A1?= =?UTF-8?q?=E7=A5=A8=E4=BB=B7=E6=A0=BC=E8=B5=B0=E5=8A=BF=E3=80=82=20?= =?UTF-8?q?=E6=88=91=E5=8F=91=E7=8E=B0=E4=BA=86=E4=B8=8B=E9=9D=A2=E7=9A=84?= =?UTF-8?q?=E5=87=A0=E4=B8=AAbug=EF=BC=8C=E8=AF=B7=E5=B8=AE=E6=88=91?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20Bug=201:=20parsing=5Fkeystats.py=20-=20pan?= =?UTF-8?q?das=20append=E5=BC=83=E7=94=A8=20Bug=202:=20=E7=89=B9=E5=BE=81?= =?UTF-8?q?=E5=90=8D=E4=B8=8D=E5=AE=8C=E6=95=B4/=E4=B8=8D=E4=B8=80?= =?UTF-8?q?=E8=87=B4=20Bug=203:=20data=5Fstring=5Fto=5Ffloat=20=E5=A4=A7?= =?UTF-8?q?=E5=B0=8F=E5=86=99=E6=95=8F=E6=84=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 约束条件: 不要修改原本代码的逻辑 生成测试文档 --- BUGFIXES.md | 136 +++++++++++++++++++++++++++++++++++++++++ parsing_keystats.py | 9 +-- tests/test_datasets.py | 31 ++++++++++ tests/test_utils.py | 31 +++++++++- utils.py | 15 ++--- 5 files changed, 209 insertions(+), 13 deletions(-) create mode 100644 BUGFIXES.md diff --git a/BUGFIXES.md b/BUGFIXES.md new file mode 100644 index 00000000..9c3ca42a --- /dev/null +++ b/BUGFIXES.md @@ -0,0 +1,136 @@ +# Bug修复文档 + +本文档记录了项目中发现的Bug及其修复方案。 + +## Bug 1: parsing_keystats.py - pandas append弃用 + +### 问题描述 +`pandas.DataFrame.append()` 
方法自 pandas 1.4 起已被弃用,并已在 pandas 2.0 中移除。在 1.4–1.x 版本中使用该方法会触发`FutureWarning`,在 2.0 及以上版本中调用则会抛出`AttributeError`。 + +### 修复方案 +将 `df.append()` 替换为使用列表收集行数据,最后通过 `pd.DataFrame()` 创建DataFrame的方式。 + +### 代码变更 +**文件**: `parsing_keystats.py` + +```python +# 修复前 +df = pd.DataFrame(columns=df_columns) +... +df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True) + +# 修复后 +df_rows = [] +... +df_rows.append(dict(zip(df_columns, new_df_row))) +... +df = pd.DataFrame(df_rows, columns=df_columns) +``` + +### 测试用例 +测试文件: `tests/test_datasets.py` +- `test_pandas_append_deprecated_fix()`: 验证代码不再使用已弃用的`append`方法 + +--- + +## Bug 2: 特征名不完整/不一致 + +### 问题描述 +`parsing_keystats.py`中的特征列表包含两个不完整的特征名: +- `"Shares Short (as of"` - 缺少右括号 +- `"Shares Short (prior month"` - 缺少右括号 + +这会导致正则表达式匹配失败或数据解析不正确。 + +### 修复方案 +为这两个特征名添加缺失的右括号。 + +### 代码变更 +**文件**: `parsing_keystats.py` + +```python +# 修复前 +"Shares Short (as of", +... +"Shares Short (prior month", + +# 修复后 +"Shares Short (as of)", +... +"Shares Short (prior month)", +``` + +### 测试用例 +测试文件: `tests/test_datasets.py` +- `test_features_names_complete()`: 验证所有特征名的括号都是平衡的 + +--- + +## Bug 3: data_string_to_float 大小写敏感 + +### 问题描述 +`utils.py`中的`data_string_to_float`函数在处理数字单位后缀(K、M、B)和NaN值时是大小写敏感的。这会导致: +- `"10k"` 无法被正确识别为10000 +- `"nan"` 或 `"NAN"` 无法被识别为N/A + +### 修复方案 +将输入字符串转换为大写后再进行匹配处理。 + +### 代码变更 +**文件**: `utils.py` + +```python +# 修复前 +def data_string_to_float(number_string): + if ("N/A" in number_string) or ("NaN" in number_string): + return "N/A" + elif "B" in number_string: + return float(number_string.replace("B", "")) * 1000000000 + elif "M" in number_string: + return float(number_string.replace("M", "")) * 1000000 + elif "K" in number_string: + return float(number_string.replace("K", "")) * 1000 + +# 修复后 +def data_string_to_float(number_string): + number_string_upper = number_string.upper() + if ("N/A" in number_string_upper) or ("NAN" in number_string_upper): + return "N/A" + elif "B" in number_string_upper: + return 
float(number_string_upper.replace("B", "")) * 1000000000 + elif "M" in number_string_upper: + return float(number_string_upper.replace("M", "")) * 1000000 + elif "K" in number_string_upper: + return float(number_string_upper.replace("K", "")) * 1000 +``` + +### 测试用例 +测试文件: `tests/test_utils.py` +- `test_data_string_to_float_case_insensitive()`: 验证大小写不敏感的处理 + - 测试小写单位: `10k`, `5m`, `2.5b` + - 测试混合大小写: `10K` vs `10k` + - 测试负值与小写单位: `-100.1k`, `-0.1m`, `-0.02b` + - 测试NaN变体: `nan`, `NaN`, `NAN`, `n/a`, `N/a` + +--- + +## 运行测试 + +要运行所有测试,请执行: + +```bash +pytest tests/ +``` + +要运行特定的Bug修复测试: + +```bash +# Bug 1 测试 +pytest tests/test_datasets.py::test_pandas_append_deprecated_fix + +# Bug 2 测试 +pytest tests/test_datasets.py::test_features_names_complete + +# Bug 3 测试 +pytest tests/test_utils.py::test_data_string_to_float_case_insensitive +``` diff --git a/parsing_keystats.py b/parsing_keystats.py index ec12da51..20a3e075 100644 --- a/parsing_keystats.py +++ b/parsing_keystats.py @@ -51,10 +51,10 @@ "Float", "% Held by Insiders", "% Held by Institutions", - "Shares Short (as of", + "Shares Short (as of)", "Short Ratio", "Short % of Float", - "Shares Short (prior month", + "Shares Short (prior month)", ] @@ -110,7 +110,7 @@ def parse_keystats(sp500_df, stock_df): "SP500_p_change", ] + features - df = pd.DataFrame(columns=df_columns) + df_rows = [] # tqdm is a simple progress bar for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"): @@ -214,8 +214,9 @@ def parse_keystats(sp500_df, stock_df): sp500_p_change, ] + value_list - df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True) + df_rows.append(dict(zip(df_columns, new_df_row))) + df = pd.DataFrame(df_rows, columns=df_columns) # Remove rows with missing stock price data df.dropna(axis=0, subset=["Price", "stock_p_change"], inplace=True) # Output the CSV diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ff3344ea..0f7d79e2 100644 --- a/tests/test_datasets.py +++ 
b/tests/test_datasets.py @@ -1,5 +1,6 @@ import os import pandas as pd +import pytest import parsing_keystats import stock_prediction @@ -92,3 +93,33 @@ def test_stock_prediction_dataset(): assert X.shape[0] == df.shape[0] - num_rows_with_nan assert len(y) == df.shape[0] - num_rows_with_nan assert X.shape[1] == len(parsing_keystats.features) + + +def test_features_names_complete(): + """ + Bug Fix 2: Feature names should be complete and consistent. + This test verifies that feature names have proper parentheses matching. + """ + for feature in parsing_keystats.features: + # Check that parentheses are balanced + open_count = feature.count('(') + close_count = feature.count(')') + assert open_count == close_count, f"Feature '{feature}' has unbalanced parentheses" + + +def test_pandas_append_deprecated_fix(): + """ + Bug Fix 1: pandas.DataFrame.append is deprecated. + This test verifies that parse_keystats no longer uses the deprecated append method. + Instead, it should collect rows in a list and create the DataFrame at the end. 
+ """ + import inspect + source = inspect.getsource(parsing_keystats.parse_keystats) + + # Verify that append is not used on DataFrame + assert "df.append(" not in source, "parse_keystats should not use deprecated DataFrame.append()" + + # Verify that the new pattern is used (collecting rows in a list) + assert "df_rows = []" in source, "parse_keystats should use list to collect rows" + assert "df_rows.append(" in source, "parse_keystats should append to list, not DataFrame" + assert "pd.DataFrame(df_rows" in source, "parse_keystats should create DataFrame from list" diff --git a/tests/test_utils.py b/tests/test_utils.py index 8016c5bf..47bc78b7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -39,7 +39,34 @@ def test_data_string_to_float(): with pytest.raises(ValueError): utils.data_string_to_float(">0x") - with pytest.raises(ValueError): - utils.data_string_to_float("10k") with pytest.raises(ValueError): utils.data_string_to_float("2KB") + + +def test_data_string_to_float_case_insensitive(): + """ + Bug Fix 3: data_string_to_float should be case-insensitive for K, M, B suffixes. + This test verifies that both uppercase and lowercase unit suffixes are handled correctly. 
+ """ + # Test lowercase 'k', 'm', 'b' + assert utils.data_string_to_float("10k") == 10000 + assert utils.data_string_to_float("5m") == 5000000 + assert utils.data_string_to_float("2.5b") == 2500000000 + + # Test mixed case + assert utils.data_string_to_float("10K") == 10000 + assert utils.data_string_to_float("10k") == 10000 + assert utils.data_string_to_float("5M") == 5000000 + assert utils.data_string_to_float("5m") == 5000000 + + # Test negative values with lowercase + assert utils.data_string_to_float("-100.1k") == -100100 + assert utils.data_string_to_float("-0.1m") == -100000 + assert utils.data_string_to_float("-0.02b") == -20000000 + + # Test NaN variations + assert utils.data_string_to_float("nan") == "N/A" + assert utils.data_string_to_float("NaN") == "N/A" + assert utils.data_string_to_float("NAN") == "N/A" + assert utils.data_string_to_float("n/a") == "N/A" + assert utils.data_string_to_float("N/a") == "N/A" diff --git a/utils.py b/utils.py index 445860bb..6380a0af 100644 --- a/utils.py +++ b/utils.py @@ -10,16 +10,17 @@ def data_string_to_float(number_string): :return: a float representation of the string, taking into account minus sign, unit, etc. 
""" # Deal with zeroes and the sign - if ("N/A" in number_string) or ("NaN" in number_string): + number_string_upper = number_string.upper() + if ("N/A" in number_string_upper) or ("NAN" in number_string_upper): return "N/A" elif number_string == ">0": return 0 - elif "B" in number_string: - return float(number_string.replace("B", "")) * 1000000000 - elif "M" in number_string: - return float(number_string.replace("M", "")) * 1000000 - elif "K" in number_string: - return float(number_string.replace("K", "")) * 1000 + elif "B" in number_string_upper: + return float(number_string_upper.replace("B", "")) * 1000000000 + elif "M" in number_string_upper: + return float(number_string_upper.replace("M", "")) * 1000000 + elif "K" in number_string_upper: + return float(number_string_upper.replace("K", "")) * 1000 else: return float(number_string)