exploratory-data-analysis/analysis.py at main · sbourgeous/exploratory-data-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import matplotlib.pyplot as plt

# Read the CPU benchmark CSV (relative path) into a DataFrame
data = pd.read_csv("CPU_benchmark_v4.csv")

# Overview banner, then the row count and a preview of the first ten rows
print("", "=" * 55, "---- Dataset Overview ----", "=" * 55, sep="\n")
print(f"Total CPUs in dataset: {data.shape[0]}")
print("First 10 Rows (as they appear in the dataset):")
print(data.iloc[:10])

# Section banner: which category scores highest on average?
print(
    "",
    "=" * 55,
    "Which CPU category has the highest average performance?",
    "=" * 55,
    sep="\n",
)

# Explain what each category label stands for
print(
    "\nCategory Legend:",
    "  Desktop = Personal computers",
    "  Server = High-power servers",
    "  Laptop = Portable computers",
    "  Mobile/Embedded = Smartphones, tablets, IoT devices",
    "  (Combinations = CPUs designed for multiple device types)\n",
    sep="\n",
)

# Mean cpuMark per category, ordered from highest to lowest
avg_by_category = (
    data.groupby("category")["cpuMark"]
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the per-category averages; the answer below is printed
# only after plt.show() returns
avg_by_category.plot(kind="bar")
plt.xlabel("CPU Category")
plt.ylabel("Average CPU Mark")
plt.title("Average CPU Performance by Category")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# The series is sorted in descending order, so its first entry is the winner
best_category = avg_by_category.index[0]
print(
    f"\nAnswer: {best_category.upper()} CPUs have the highest average "
    f"benchmark score ({avg_by_category.iloc[0]:.2f})"
)

# Single source of truth for the cut-off, so the filter and the printed
# labels cannot drift apart (formatted with a thousands separator: 50,000)
HIGH_PERFORMANCE_THRESHOLD = 50_000

# Section banner for the filter step
print("\n" + "=" * 55)
print(f"FILTER: CPUs with benchmark score above {HIGH_PERFORMANCE_THRESHOLD:,}")
print("=" * 55)

# Keep only the rows whose cpuMark is strictly above the threshold
high_performance_cpus = data[data["cpuMark"] > HIGH_PERFORMANCE_THRESHOLD]
print(
    f"Number of high-performance CPUs (cpuMark > {HIGH_PERFORMANCE_THRESHOLD:,}): "
    f"{len(high_performance_cpus)}"
)

if high_performance_cpus.empty:
    # min()/max() of an empty selection are NaN, which would print
    # "nan to nan" followed by an empty-frame dump; say so explicitly instead
    print("Performance range: n/a (no CPUs exceed the threshold)")
else:
    print(
        f"Performance range: {high_performance_cpus['cpuMark'].min():.0f} "
        f"to {high_performance_cpus['cpuMark'].max():.0f}"
    )
    print(high_performance_cpus[["cpuName", "category", "cpuMark"]])

# Section banner for the sorted view
print(
    "",
    "=" * 55,
    "SORT: CPUs sorted by LOWEST benchmark score (ascending)",
    "=" * 55,
    sep="\n",
)

# Order rows by cpuMark from lowest to highest (ascending is the default)
sorted_cpus_ascending = data.sort_values("cpuMark")
print("Lowest 10 Performing CPUs:")
# Take the first ten sorted rows, then show only the columns of interest
print(sorted_cpus_ascending.iloc[:10][["cpuName", "category", "cpuMark"]])

# Section banner for the summary statistics
print("", "=" * 55, "AGGREGATION: CPU Performance Statistics", "=" * 55, sep="\n")

# describe() supplies the mean, std, min, max and 50% entries read below
stats = data["cpuMark"].describe()
print("\nInsights:")
print(f"  - Average CPU performance: {stats['mean']:.2f}")
print(f"  - Median CPU performance: {stats['50%']:.2f}")
print(f"  - Standard Deviation: {stats['std']:.2f}")
print(f"  - Performance Range: {stats['min']:.0f} to {stats['max']:.0f}")
# Summing a boolean mask counts its True entries, i.e. the matching rows
print(f"  - CPUs above average: {(data['cpuMark'] > stats['mean']).sum()}")
print(f"  - CPUs below average: {(data['cpuMark'] < stats['mean']).sum()}")

# Section banner for the price-to-performance question
print("\n" + "=" * 55)
print("What has the best price-to-performance ratio for CPUs?")
print("=" * 55)

# Keep only rows where both values are present and strictly positive:
#  - a missing price or score cannot be ranked,
#  - a price of 0 would yield a ratio of 0 and wrongly rank first,
#  - a cpuMark of 0 would divide by zero (inf/NaN ratio).
data_with_prices = data.dropna(subset=["price", "cpuMark"])
data_with_prices = data_with_prices[
    (data_with_prices["price"] > 0) & (data_with_prices["cpuMark"] > 0)
].copy()  # copy so the new column is not written onto a slice of `data`

# Price paid per benchmark point; lower means better value
data_with_prices["price_per_mark"] = data_with_prices["price"] / data_with_prices["cpuMark"]

# The five CPUs with the lowest price per benchmark point
best_value = data_with_prices.nsmallest(5, "price_per_mark")
if best_value.empty:
    # Nothing usable to rank; avoid printing an empty-frame dump as an "answer"
    print("\nAnswer: No CPUs with usable price and benchmark data were found.")
else:
    print("\nAnswer: The Top 5 CPUs with the Best Price-to-Performance Ratio:")
    print(best_value[["cpuName", "price", "cpuMark", "price_per_mark"]].round(5).to_string())
print()