第14课: 数据处理与分析

NumPy - 数值计算

NumPy是Python科学计算的基础库，提供高性能的多维数组对象和数学函数。

安装NumPy

pip install numpy

创建数组

import numpy as np

# Build a 1-D array from a Python list
arr1 = np.array([1, 2, 3, 4, 5])
print(arr1)  # [1 2 3 4 5]

# Build a 2-D array from nested lists (one inner list per row)
arr2 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr2)
# [[1 2 3]
#  [4 5 6]]

# Arrays with preset contents
zeros = np.zeros(shape=(3, 4))       # 3x4 array of 0.0
ones = np.ones(shape=(2, 3))         # 2x3 array of 1.0
empty = np.empty(shape=(2, 2))       # 2x2 array, allocated but NOT initialised: contents are arbitrary
arange = np.arange(0, 10, step=2)    # [0 2 4 6 8] (stop value is excluded)
linspace = np.linspace(0, 1, num=5)  # 5 evenly spaced values, both ends included

# Random arrays
random_arr = np.random.rand(3, 3)  # 3x3 uniform floats in [0, 1)
random_int = np.random.randint(low=0, high=10, size=(3, 3))  # 3x3 integers in [0, 10)

数组操作

import numpy as np

arr = np.array([1, 2, 3, 4, 5])

# Basic attributes of an ndarray
print(arr.shape)   # (5,)  - length along each dimension
print(arr.dtype)   # int64 on most platforms (NumPy's default integer; can be int32 on Windows with NumPy 1.x)
print(arr.ndim)    # 1     - number of dimensions
print(arr.size)    # 5     - total number of elements

# Arithmetic operators work element by element on arrays of equal shape
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

print(arr1 + arr2)  # [5 7 9]
print(arr1 * arr2)  # [ 4 10 18]
print(arr1 ** 2)    # [1 4 9]

# Math functions are applied element by element as well:
# square root, exponential, sine, natural logarithm
for func in (np.sqrt, np.exp, np.sin, np.log):
    print(func(arr1))

# Statistics over the whole array
arr = np.array([1, 2, 3, 4, 5])
print(arr.mean())       # mean: 3.0
print(np.median(arr))   # median: 3.0
print(arr.std())        # population standard deviation (ddof=0): about 1.4142
print(arr.sum())        # sum: 15
print(arr.min())        # minimum: 1
print(arr.max())        # maximum: 5

数组索引和切片

import numpy as np

arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# Single elements: arr[row, column], both zero-based
print(arr[0, 0])    # 1
print(arr[1, 2])    # 6

# Slice: rows 0-1 and columns 1-2 (the end index is exclusive)
print(arr[0:2, 1:3])
# [[2 3]
#  [5 6]]

# Boolean indexing: keep the elements for which the condition is True
print(arr[arr > 5])  # [6 7 8 9]

# Same idea with the boolean mask stored in a variable first (even values)
mask = (arr % 2) == 0
print(arr[mask])  # [2 4 6 8]

Pandas - 数据分析

Pandas是强大的数据分析库，提供DataFrame和Series数据结构。

安装Pandas

pip install pandas

Series - 一维数据

import pandas as pd

# A Series built from a list gets the default integer index 0..n-1
s = pd.Series([1, 2, 3, 4, 5])
print(s)

# A Series with explicit labels; values can then be looked up by label
labels = ['a', 'b', 'c']
s = pd.Series([100, 200, 300], index=labels)
print(s['a'])  # 100

# A Series built from a dict: keys become the index, values the data
data = {
    'a': 100,
    'b': 200,
    'c': 300,
}
s = pd.Series(data)
print(s)

DataFrame - 二维数据

import pandas as pd

# Build a DataFrame from a dict: each key is a column name, each list a column
data = {
    'name': ['张三', '李四', '王五'],
    'age': [25, 30, 35],
    'city': ['北京', '上海', '广州']
}
df = pd.DataFrame(data)
print(df)

# Inspect the data
print(df.head())      # first 5 rows (all 3 rows here)
print(df.tail())      # last 5 rows (all 3 rows here)
df.info()             # prints the summary itself and returns None, so it is not wrapped in print()
print(df.describe())  # summary statistics of the numeric columns

# Select columns
print(df['name'])            # a single column comes back as a Series
print(df[['name', 'age']])   # a list of columns comes back as a DataFrame

# Select rows
print(df.loc[0])      # by index label
print(df.iloc[0])     # by integer position

# Filter rows with a boolean condition
print(df[df['age'] > 25])
# Combine conditions with & and wrap each one in parentheses
print(df[(df['age'] > 25) & (df['city'] == '上海')])

数据操作

import pandas as pd

records = {
    'name': ['张三', '李四', '王五'],
    'age': [25, 30, 35],
    'salary': [5000, 8000, 10000]
}
df = pd.DataFrame(records)

# Add a column computed from an existing one (10% of salary)
df['bonus'] = df['salary'] * 0.1

# Remove that column again; drop() returns a new frame, so rebind df
df = df.drop(columns='bonus')

# Append a row: wrap the dict in a one-row DataFrame and concatenate,
# renumbering the index so the new row continues the 0..n-1 sequence
new_row = {'name': '赵六', 'age': 28, 'salary': 7000}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# Remove the row whose index label is 0
df = df.drop(index=0)

# Sort by age, largest first
df_sorted = df.sort_values(by='age', ascending=False)

# Mean salary within each age group
grouped = df.groupby('age')['salary'].mean()
print(grouped)

读写文件

import pandas as pd

# Each read below rebinds df, so only the last one loaded (the JSON data)
# is what the write calls further down actually save.

# Read a CSV file into a DataFrame
df = pd.read_csv('data.csv')

# Read an Excel workbook (needs an Excel engine such as openpyxl installed)
df = pd.read_excel('data.xlsx')

# Read a JSON file
df = pd.read_json('data.json')

# Write CSV; index=False leaves the row index out of the file
df.to_csv('output.csv', index=False)

# Write an Excel workbook, again without the row index
df.to_excel('output.xlsx', index=False)

# Write JSON; orient='records' emits a list with one object per row
df.to_json('output.json', orient='records')

数据清洗

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
})

# Detect missing values
print(df.isnull())        # boolean frame, True where a value is missing
print(df.isnull().sum())  # number of missing values per column

# Drop every row that contains at least one missing value
df_dropped = df.dropna()

# Fill missing values. The two strategies get separate names so that the
# zero-filled result is not overwritten before it can be inspected.
df_filled_zero = df.fillna(0)     # fill with a constant
df_filled = df.fillna(df.mean())  # fill each column with its own mean

# Drop duplicate rows (this sample contains none)
df_unique = df.drop_duplicates()

# Rename columns; columns not listed in the mapping keep their name
df = df.rename(columns={'A': 'col1', 'B': 'col2'})

Matplotlib - 数据可视化

Matplotlib是Python的绘图库，用于创建各种图表。

安装Matplotlib

pip install matplotlib

基本绘图

import matplotlib.pyplot as plt
import numpy as np

# Matplotlib's default font (DejaVu Sans) has no CJK glyphs, so the Chinese
# title and axis labels below would be drawn as empty boxes. List fonts that
# do cover Chinese (Windows / macOS / Linux candidates); Matplotlib uses the
# first one it finds installed. Many of these fonts lack the Unicode minus
# sign, so negative tick labels fall back to the ASCII hyphen.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti TC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Arial Unicode MS',
    'DejaVu Sans',
]
plt.rcParams['axes.unicode_minus'] = False

# Line plot: sin(x) sampled at 100 evenly spaced points on [0, 10]
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.plot(x, y)
plt.title('正弦函数')
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.grid(True)
plt.show()

# Several lines on one figure; each label= text is what legend() displays
plt.plot(x, np.sin(x), label='sin(x)')
plt.plot(x, np.cos(x), label='cos(x)')
plt.legend()
plt.show()

常用图表

import matplotlib.pyplot as plt
import numpy as np

# Matplotlib's default font (DejaVu Sans) has no CJK glyphs, so the Chinese
# chart titles below would be drawn as empty boxes. List fonts that do cover
# Chinese (Windows / macOS / Linux candidates); Matplotlib uses the first one
# it finds installed. Many of these fonts lack the Unicode minus sign, so
# negative tick labels fall back to the ASCII hyphen.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti TC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Arial Unicode MS',
    'DejaVu Sans',
]
plt.rcParams['axes.unicode_minus'] = False

# Scatter plot of 50 random points in the unit square
x = np.random.rand(50)
y = np.random.rand(50)
plt.scatter(x, y)
plt.title('散点图')
plt.show()

# Bar chart: one bar per category
categories = ['A', 'B', 'C', 'D']
values = [25, 40, 30, 55]
plt.bar(categories, values)
plt.title('柱状图')
plt.show()

# Pie chart; autopct formats each slice's share as a percentage with one decimal
sizes = [25, 30, 20, 25]
labels = ['A', 'B', 'C', 'D']
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('饼图')
plt.show()

# Histogram of 1000 standard-normal samples, split into 30 bins
data = np.random.randn(1000)
plt.hist(data, bins=30)
plt.title('直方图')
plt.show()

子图

import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(0, 10, 100)

# 2x2 grid of subplots; axes is a 2-D array indexed as axes[row, col]
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# (curve, title) for each panel, listed row by row to match axes.flat
panels = [
    (np.sin(x), 'sin(x)'),
    (np.cos(x), 'cos(x)'),
    (np.tan(x), 'tan(x)'),
    (x**2, 'x²'),
]
for ax, (ys, title) in zip(axes.flat, panels):
    ax.plot(x, ys)
    ax.set_title(title)

# Adjust spacing so titles and tick labels do not overlap
plt.tight_layout()
plt.show()

数据分析实战示例

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reproducible sample data: 100 consecutive days starting 2024-01-01
np.random.seed(42)
dates = pd.date_range(start='2024-01-01', periods=100)
data = {
    'date': dates,
    'sales': np.random.randint(low=100, high=1000, size=100),
    'customers': np.random.randint(low=10, high=100, size=100)
}
df = pd.DataFrame(data)

# Summary statistics for the sales column
print("销售统计:")
sales = df['sales']
print(sales.describe())

# 7-day moving average; the first 6 rows have no full window and are NaN
df['sales_ma'] = sales.rolling(window=7).mean()

# Total sales per calendar month
df['month'] = df['date'].dt.month
monthly_sales = df.groupby(by='month')['sales'].sum()

# Visualisation
# Matplotlib's default font (DejaVu Sans) has no CJK glyphs, so the Chinese
# titles, labels and legend entries below would be drawn as empty boxes. List
# fonts that do cover Chinese (Windows / macOS / Linux candidates); Matplotlib
# uses the first one it finds installed. Many of these fonts lack the Unicode
# minus sign, so negative tick labels fall back to the ASCII hyphen.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti TC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Arial Unicode MS',
    'DejaVu Sans',
]
plt.rcParams['axes.unicode_minus'] = False

# Two stacked panels: daily trend on top, monthly totals below
fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# Sales trend: raw daily values plus the smoothed 7-day moving average
axes[0].plot(df['date'], df['sales'], label='日销售额')
axes[0].plot(df['date'], df['sales_ma'], label='7日移动平均', linewidth=2)
axes[0].set_title('销售趋势')
axes[0].legend()
axes[0].grid(True)

# Monthly sales: one bar per month number
axes[1].bar(monthly_sales.index, monthly_sales.values)
axes[1].set_title('月度销售统计')
axes[1].set_xlabel('月份')
axes[1].set_ylabel('销售额')

plt.tight_layout()
plt.show()

练习

1. 使用NumPy创建一个10x10的随机矩阵，计算其均值和标准差
2. 创建一个学生成绩DataFrame，包含姓名、数学、英语、Python成绩，计算总分和平均分
3. 读取CSV文件，进行数据清洗（处理缺失值），并生成统计报告
4. 使用Matplotlib绘制一个包含折线图和柱状图的组合图表

练习答案:

# 练习1: NumPy矩阵统计
import numpy as np

# 10x10 matrix of uniform random floats in [0, 1)
rows, cols = 10, 10
matrix = np.random.rand(rows, cols)

# No axis is given, so each statistic is taken over all 100 elements;
# :.4f prints four decimal places
print(f"均值: {np.mean(matrix):.4f}")
print(f"标准差: {np.std(matrix):.4f}")
print(f"最大值: {np.max(matrix):.4f}")
print(f"最小值: {np.min(matrix):.4f}")

# 练习2: 学生成绩分析
import pandas as pd

students = pd.DataFrame({
    'name': ['张三', '李四', '王五', '赵六'],
    'math': [85, 90, 78, 92],
    'english': [88, 85, 90, 87],
    'python': [92, 88, 85, 95]
})

# Select the three subject columns once, then aggregate across each row
# (axis=1) to get every student's total and average score
subjects = ['math', 'english', 'python']
scores = students[subjects]
students['total'] = scores.sum(axis=1)
students['average'] = scores.mean(axis=1)

print(students)

# 练习3: 数据清洗
import pandas as pd
import numpy as np

import io

# The exercise asks for data read from a CSV file. An in-memory text buffer
# stands in for the file so this answer runs without anything on disk; pass a
# path such as 'data.csv' to pd.read_csv() to load a real file instead.
# Empty fields are parsed as NaN (missing values).
csv_text = """A,B,C
1,,1
2,2,2
,3,3
4,4,4
5,5,5
"""
df = pd.read_csv(io.StringIO(csv_text))

print("原始数据:")
print(df)

# Count the missing values in each column
print("\n缺失值统计:")
print(df.isnull().sum())

# Replace each missing value with the mean of its column
df_cleaned = df.fillna(df.mean())

print("\n清洗后数据:")
print(df_cleaned)

# Summary statistics of the cleaned data
print("\n统计报告:")
print(df_cleaned.describe())

# 练习4: 组合图表
import matplotlib.pyplot as plt
import numpy as np

# Matplotlib's default font (DejaVu Sans) has no CJK glyphs, so the Chinese
# titles, labels and legend entries below would be drawn as empty boxes. List
# fonts that do cover Chinese (Windows / macOS / Linux candidates); Matplotlib
# uses the first one it finds installed. Many of these fonts lack the Unicode
# minus sign, so negative tick labels fall back to the ASCII hyphen.
plt.rcParams['font.sans-serif'] = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti TC',
    'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Arial Unicode MS',
    'DejaVu Sans',
]
plt.rcParams['axes.unicode_minus'] = False

# Months 1..5 and the sales of two products for each month
x = np.arange(1, 6)
y1 = [20, 35, 30, 35, 27]
y2 = [25, 32, 34, 20, 25]

# One figure with two side-by-side panels
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Line chart: one line per product, with a different marker for each
ax1.plot(x, y1, marker='o', label='产品A')
ax1.plot(x, y2, marker='s', label='产品B')
ax1.set_title('销售趋势')
ax1.set_xlabel('月份')
ax1.set_ylabel('销售额')
ax1.legend()
ax1.grid(True)

# Grouped bar chart: shift each product's bars by half a bar width to the
# left/right of the month position so the two sit side by side
width = 0.35
ax2.bar(x - width/2, y1, width, label='产品A')
ax2.bar(x + width/2, y2, width, label='产品B')
ax2.set_title('月度对比')
ax2.set_xlabel('月份')
ax2.set_ylabel('销售额')
ax2.legend()

plt.tight_layout()
plt.show()

上一课 | 下一课: 最佳实践与项目实战