diff --git a/BUGFIXES.md b/BUGFIXES.md new file mode 100644 index 00000000..9c3ca42a --- /dev/null +++ b/BUGFIXES.md @@ -0,0 +1,136 @@ +# Bug修复文档 + +本文档记录了项目中发现的Bug及其修复方案。 + +## Bug 1: parsing_keystats.py - pandas append弃用 + +### 问题描述 +`pandas.DataFrame.append()` 方法自 pandas 1.4 起已被弃用(调用时会产生 `FutureWarning`),并已在 pandas 2.0 中被移除(调用时会抛出 `AttributeError`)。 + +### 修复方案 +将 `df.append()` 替换为使用列表收集行数据,最后通过 `pd.DataFrame()` 创建DataFrame的方式。 + +### 代码变更 +**文件**: `parsing_keystats.py` + +```python +# 修复前 +df = pd.DataFrame(columns=df_columns) +... +df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True) + +# 修复后 +df_rows = [] +... +df_rows.append(dict(zip(df_columns, new_df_row))) +... +df = pd.DataFrame(df_rows, columns=df_columns) +``` + +### 测试用例 +测试文件: `tests/test_datasets.py` +- `test_pandas_append_deprecated_fix()`: 验证代码不再使用已弃用的`DataFrame.append`方法(列表的`append`方法不受影响) + +--- + +## Bug 2: 特征名不完整/不一致 + +### 问题描述 +`parsing_keystats.py`中的特征列表包含两个不完整的特征名: +- `"Shares Short (as of"` - 缺少右括号 +- `"Shares Short (prior month"` - 缺少右括号 + +这会导致正则表达式匹配失败或数据解析不正确。 + +### 修复方案 +为这两个特征名添加缺失的右括号。 + +### 代码变更 +**文件**: `parsing_keystats.py` + +```python +# 修复前 +"Shares Short (as of", +... +"Shares Short (prior month", + +# 修复后 +"Shares Short (as of)", +... 
+"Shares Short (prior month)", +``` + +### 测试用例 +测试文件: `tests/test_datasets.py` +- `test_features_names_complete()`: 验证所有特征名的括号都是平衡的 + +--- + +## Bug 3: data_string_to_float 大小写敏感 + +### 问题描述 +`utils.py`中的`data_string_to_float`函数在处理数字单位后缀(K、M、B)和NaN值时是大小写敏感的。这会导致: +- `"10k"` 无法被正确识别为10000 +- `"nan"` 或 `"NAN"` 无法被识别为N/A + +### 修复方案 +将输入字符串转换为大写后再进行匹配处理。 + +### 代码变更 +**文件**: `utils.py` + +```python +# 修复前 +def data_string_to_float(number_string): + if ("N/A" in number_string) or ("NaN" in number_string): + return "N/A" + elif "B" in number_string: + return float(number_string.replace("B", "")) * 1000000000 + elif "M" in number_string: + return float(number_string.replace("M", "")) * 1000000 + elif "K" in number_string: + return float(number_string.replace("K", "")) * 1000 + +# 修复后 +def data_string_to_float(number_string): + number_string_upper = number_string.upper() + if ("N/A" in number_string_upper) or ("NAN" in number_string_upper): + return "N/A" + elif "B" in number_string_upper: + return float(number_string_upper.replace("B", "")) * 1000000000 + elif "M" in number_string_upper: + return float(number_string_upper.replace("M", "")) * 1000000 + elif "K" in number_string_upper: + return float(number_string_upper.replace("K", "")) * 1000 +``` + +### 测试用例 +测试文件: `tests/test_utils.py` +- `test_data_string_to_float_case_insensitive()`: 验证大小写不敏感的处理 + - 测试小写单位: `10k`, `5m`, `2.5b` + - 测试混合大小写: `10K` vs `10k` + - 测试负值与小写单位: `-100.1k`, `-0.1m`, `-0.02b` + - 测试NaN变体: `nan`, `NaN`, `NAN`, `n/a`, `N/a` + +--- + +## 运行测试 + +要运行所有测试,请执行: + +```bash +pytest tests/ +``` + +要运行特定的Bug修复测试: + +```bash +# Bug 1 测试 +pytest tests/test_datasets.py::test_pandas_append_deprecated_fix + +# Bug 2 测试 +pytest tests/test_datasets.py::test_features_names_complete + +# Bug 3 测试 +pytest tests/test_utils.py::test_data_string_to_float_case_insensitive +``` diff --git a/parsing_keystats.py b/parsing_keystats.py index ec12da51..20a3e075 100644 --- a/parsing_keystats.py +++ b/parsing_keystats.py @@ 
-51,10 +51,10 @@ "Float", "% Held by Insiders", "% Held by Institutions", - "Shares Short (as of", + "Shares Short (as of)", "Short Ratio", "Short % of Float", - "Shares Short (prior month", + "Shares Short (prior month)", ] @@ -110,7 +110,7 @@ def parse_keystats(sp500_df, stock_df): "SP500_p_change", ] + features - df = pd.DataFrame(columns=df_columns) + df_rows = [] # tqdm is a simple progress bar for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"): @@ -214,8 +214,9 @@ def parse_keystats(sp500_df, stock_df): sp500_p_change, ] + value_list - df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True) + df_rows.append(dict(zip(df_columns, new_df_row))) + df = pd.DataFrame(df_rows, columns=df_columns) # Remove rows with missing stock price data df.dropna(axis=0, subset=["Price", "stock_p_change"], inplace=True) # Output the CSV diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ff3344ea..0f7d79e2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ import os import pandas as pd +import pytest import parsing_keystats import stock_prediction @@ -92,3 +93,33 @@ def test_stock_prediction_dataset(): assert X.shape[0] == df.shape[0] - num_rows_with_nan assert len(y) == df.shape[0] - num_rows_with_nan assert X.shape[1] == len(parsing_keystats.features) + + +def test_features_names_complete(): + """ + Bug Fix 2: Feature names should be complete and consistent. + This test verifies that feature names have proper parentheses matching. + """ + for feature in parsing_keystats.features: + # Check that parentheses are balanced + open_count = feature.count('(') + close_count = feature.count(')') + assert open_count == close_count, f"Feature '{feature}' has unbalanced parentheses" + + +def test_pandas_append_deprecated_fix(): + """ + Bug Fix 1: pandas.DataFrame.append is deprecated. + This test verifies that parse_keystats no longer uses the deprecated append method. 
+ Instead, it should collect rows in a list and create the DataFrame at the end. + """ + import inspect + source = inspect.getsource(parsing_keystats.parse_keystats) + + # Verify that append is not used on DataFrame + assert ".append(" not in source, "parse_keystats should not use deprecated DataFrame.append()" + + # Verify that the new pattern is used (collecting rows in a list) + assert "df_rows = []" in source, "parse_keystats should use list to collect rows" + assert "df_rows.append(" in source, "parse_keystats should append to list, not DataFrame" + assert "pd.DataFrame(df_rows" in source, "parse_keystats should create DataFrame from list" diff --git a/tests/test_utils.py b/tests/test_utils.py index 8016c5bf..47bc78b7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -39,7 +39,34 @@ def test_data_string_to_float(): with pytest.raises(ValueError): utils.data_string_to_float(">0x") - with pytest.raises(ValueError): - utils.data_string_to_float("10k") with pytest.raises(ValueError): utils.data_string_to_float("2KB") + + +def test_data_string_to_float_case_insensitive(): + """ + Bug Fix 3: data_string_to_float should be case-insensitive for K, M, B suffixes. + This test verifies that both uppercase and lowercase unit suffixes are handled correctly. 
+ """ + # Test lowercase 'k', 'm', 'b' + assert utils.data_string_to_float("10k") == 10000 + assert utils.data_string_to_float("5m") == 5000000 + assert utils.data_string_to_float("2.5b") == 2500000000 + + # Test mixed case + assert utils.data_string_to_float("10K") == 10000 + assert utils.data_string_to_float("10k") == 10000 + assert utils.data_string_to_float("5M") == 5000000 + assert utils.data_string_to_float("5m") == 5000000 + + # Test negative values with lowercase + assert utils.data_string_to_float("-100.1k") == -100100 + assert utils.data_string_to_float("-0.1m") == -100000 + assert utils.data_string_to_float("-0.02b") == -20000000 + + # Test NaN variations + assert utils.data_string_to_float("nan") == "N/A" + assert utils.data_string_to_float("NaN") == "N/A" + assert utils.data_string_to_float("NAN") == "N/A" + assert utils.data_string_to_float("n/a") == "N/A" + assert utils.data_string_to_float("N/a") == "N/A" diff --git a/utils.py b/utils.py index 445860bb..6380a0af 100644 --- a/utils.py +++ b/utils.py @@ -10,16 +10,17 @@ def data_string_to_float(number_string): :return: a float representation of the string, taking into account minus sign, unit, etc. 
""" # Deal with zeroes and the sign - if ("N/A" in number_string) or ("NaN" in number_string): + number_string_upper = number_string.upper() + if ("N/A" in number_string_upper) or ("NAN" in number_string_upper): return "N/A" elif number_string == ">0": return 0 - elif "B" in number_string: - return float(number_string.replace("B", "")) * 1000000000 - elif "M" in number_string: - return float(number_string.replace("M", "")) * 1000000 - elif "K" in number_string: - return float(number_string.replace("K", "")) * 1000 + elif "B" in number_string_upper: + return float(number_string_upper.replace("B", "")) * 1000000000 + elif "M" in number_string_upper: + return float(number_string_upper.replace("M", "")) * 1000000 + elif "K" in number_string_upper: + return float(number_string_upper.replace("K", "")) * 1000 else: return float(number_string)