Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions BUGFIXES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Bug修复文档

本文档记录了项目中发现的Bug及其修复方案。

## Bug 1: parsing_keystats.py - pandas append弃用

### 问题描述
`pandas.DataFrame.append()` 方法自 pandas 1.4 起已被弃用,并已在 pandas 2.0 中移除。在 pandas 1.4–1.5 中调用该方法会触发 `FutureWarning`;在 pandas 2.0 及以上版本中调用则会直接抛出 `AttributeError`,导致解析流程无法运行。

### 修复方案
将 `df.append()` 替换为使用列表收集行数据,最后通过 `pd.DataFrame()` 创建DataFrame的方式。

### 代码变更
**文件**: `parsing_keystats.py`

```python
# 修复前
df = pd.DataFrame(columns=df_columns)
...
df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True)

# 修复后
df_rows = []
...
df_rows.append(dict(zip(df_columns, new_df_row)))
...
df = pd.DataFrame(df_rows, columns=df_columns)
```

### 测试用例
测试文件: `tests/test_datasets.py`
- `test_pandas_append_deprecated_fix()`: 验证代码不再使用已弃用的`append`方法

---

## Bug 2: 特征名不完整/不一致

### 问题描述
`parsing_keystats.py`中的特征列表包含两个不完整的特征名:
- `"Shares Short (as of"` - 缺少右括号
- `"Shares Short (prior month"` - 缺少右括号

这会导致正则表达式匹配失败或数据解析不正确。

### 修复方案
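为这两个特征名添加缺失的右括号。注意:如果这些特征名是作为前缀去匹配页面上带日期的标签(例如 `Shares Short (as of <日期>)`),补全右括号后原有匹配可能失效,合并前需要用真实数据确认这两个特征仍能被正确解析。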

### 代码变更
**文件**: `parsing_keystats.py`

```python
# 修复前
"Shares Short (as of",
...
"Shares Short (prior month",

# 修复后
"Shares Short (as of)",
...
"Shares Short (prior month)",
```

### 测试用例
测试文件: `tests/test_datasets.py`
- `test_features_names_complete()`: 验证所有特征名的括号都是平衡的

---

## Bug 3: data_string_to_float 大小写敏感

### 问题描述
`utils.py`中的`data_string_to_float`函数在处理数字单位后缀(K、M、B)和NaN值时是大小写敏感的。这会导致:
- `"10k"` 无法被正确识别为10000
- `"nan"` 或 `"NAN"` 无法被识别为N/A

### 修复方案
将输入字符串转换为大写后再进行匹配处理。

### 代码变更
**文件**: `utils.py`

```python
# 修复前
def data_string_to_float(number_string):
if ("N/A" in number_string) or ("NaN" in number_string):
return "N/A"
elif "B" in number_string:
return float(number_string.replace("B", "")) * 1000000000
elif "M" in number_string:
return float(number_string.replace("M", "")) * 1000000
elif "K" in number_string:
return float(number_string.replace("K", "")) * 1000

# 修复后
def data_string_to_float(number_string):
number_string_upper = number_string.upper()
if ("N/A" in number_string_upper) or ("NAN" in number_string_upper):
return "N/A"
elif "B" in number_string_upper:
return float(number_string_upper.replace("B", "")) * 1000000000
elif "M" in number_string_upper:
return float(number_string_upper.replace("M", "")) * 1000000
elif "K" in number_string_upper:
return float(number_string_upper.replace("K", "")) * 1000
```

### 测试用例
测试文件: `tests/test_utils.py`
- `test_data_string_to_float_case_insensitive()`: 验证大小写不敏感的处理
- 测试小写单位: `10k`, `5m`, `2.5b`
- 测试大写与小写单位结果一致: `10K` 与 `10k`, `5M` 与 `5m`
- 测试负值与小写单位: `-100.1k`, `-0.1m`, `-0.02b`
- 测试NaN变体: `nan`, `NaN`, `NAN`, `n/a`, `N/a`

---

## 运行测试

要运行所有测试,请执行:

```bash
pytest tests/
```

要运行特定的Bug修复测试:

```bash
# Bug 1 测试
pytest tests/test_datasets.py::test_pandas_append_deprecated_fix

# Bug 2 测试
pytest tests/test_datasets.py::test_features_names_complete

# Bug 3 测试
pytest tests/test_utils.py::test_data_string_to_float_case_insensitive
```
9 changes: 5 additions & 4 deletions parsing_keystats.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@
"Float",
"% Held by Insiders",
"% Held by Institutions",
"Shares Short (as of",
"Shares Short (as of)",
"Short Ratio",
"Short % of Float",
"Shares Short (prior month",
"Shares Short (prior month)",
]


Expand Down Expand Up @@ -110,7 +110,7 @@ def parse_keystats(sp500_df, stock_df):
"SP500_p_change",
] + features

df = pd.DataFrame(columns=df_columns)
df_rows = []

# tqdm is a simple progress bar
for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"):
Expand Down Expand Up @@ -214,8 +214,9 @@ def parse_keystats(sp500_df, stock_df):
sp500_p_change,
] + value_list

df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True)
df_rows.append(dict(zip(df_columns, new_df_row)))

df = pd.DataFrame(df_rows, columns=df_columns)
# Remove rows with missing stock price data
df.dropna(axis=0, subset=["Price", "stock_p_change"], inplace=True)
# Output the CSV
Expand Down
31 changes: 31 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pandas as pd
import pytest

import parsing_keystats
import stock_prediction
Expand Down Expand Up @@ -92,3 +93,33 @@ def test_stock_prediction_dataset():
assert X.shape[0] == df.shape[0] - num_rows_with_nan
assert len(y) == df.shape[0] - num_rows_with_nan
assert X.shape[1] == len(parsing_keystats.features)


def test_features_names_complete():
    """
    Bug Fix 2: every feature name must be complete and consistent.

    Iterates over ``parsing_keystats.features`` and fails on the first name
    whose count of opening parentheses differs from its count of closing ones.
    """
    for feature in parsing_keystats.features:
        # Zero means the name has as many '(' as ')'.
        paren_balance = feature.count("(") - feature.count(")")
        assert paren_balance == 0, f"Feature '{feature}' has unbalanced parentheses"


def test_pandas_append_deprecated_fix():
    """
    Bug Fix 1: pandas.DataFrame.append is deprecated.

    Inspects the source text of ``parsing_keystats.parse_keystats`` and checks
    that it no longer calls ``.append(`` on the ``df`` DataFrame variable, and
    that it instead collects rows in the ``df_rows`` list and builds the
    DataFrame from that list.
    """
    import inspect
    import re

    source = inspect.getsource(parsing_keystats.parse_keystats)

    # Only flag ``.append(`` invoked on the variable ``df``. A blanket
    # '".append(" not in source' check can never pass together with the
    # ``df_rows.append(`` assertion below, because list.append() contains the
    # very same substring.
    assert not re.search(
        r"\bdf\.append\(", source
    ), "parse_keystats should not use deprecated DataFrame.append()"

    # Verify that the new pattern is used (collecting rows in a list)
    assert "df_rows = []" in source, "parse_keystats should use list to collect rows"
    assert "df_rows.append(" in source, "parse_keystats should append to list, not DataFrame"
    assert "pd.DataFrame(df_rows" in source, "parse_keystats should create DataFrame from list"
31 changes: 29 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,34 @@ def test_data_string_to_float():

with pytest.raises(ValueError):
utils.data_string_to_float(">0x")
with pytest.raises(ValueError):
utils.data_string_to_float("10k")
with pytest.raises(ValueError):
utils.data_string_to_float("2KB")


def test_data_string_to_float_case_insensitive():
    """
    Bug Fix 3: data_string_to_float must treat the K, M and B suffixes (and the
    NaN / N/A markers) the same way regardless of letter case.
    """
    # (input string, expected numeric result), checked in this order.
    numeric_cases = [
        # Lowercase 'k', 'm', 'b'
        ("10k", 10000),
        ("5m", 5000000),
        ("2.5b", 2500000000),
        # Uppercase and lowercase spellings of the same unit
        ("10K", 10000),
        ("10k", 10000),
        ("5M", 5000000),
        ("5m", 5000000),
        # Negative values with lowercase units
        ("-100.1k", -100100),
        ("-0.1m", -100000),
        ("-0.02b", -20000000),
    ]
    for text, expected in numeric_cases:
        assert utils.data_string_to_float(text) == expected

    # Every spelling of the missing-value markers maps to the "N/A" string.
    for text in ("nan", "NaN", "NAN", "n/a", "N/a"):
        assert utils.data_string_to_float(text) == "N/A"
15 changes: 8 additions & 7 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,17 @@ def data_string_to_float(number_string):
:return: a float representation of the string, taking into account minus sign, unit, etc.
"""
# Deal with zeroes and the sign
if ("N/A" in number_string) or ("NaN" in number_string):
number_string_upper = number_string.upper()
if ("N/A" in number_string_upper) or ("NAN" in number_string_upper):
return "N/A"
elif number_string == ">0":
return 0
elif "B" in number_string:
return float(number_string.replace("B", "")) * 1000000000
elif "M" in number_string:
return float(number_string.replace("M", "")) * 1000000
elif "K" in number_string:
return float(number_string.replace("K", "")) * 1000
elif "B" in number_string_upper:
return float(number_string_upper.replace("B", "")) * 1000000000
elif "M" in number_string_upper:
return float(number_string_upper.replace("M", "")) * 1000000
elif "K" in number_string_upper:
return float(number_string_upper.replace("K", "")) * 1000
else:
return float(number_string)

Expand Down