NumPy是Python科学计算的基础库,提供高性能的多维数组对象和数学函数。
pip install numpy
import numpy as np
# Build a 1-D array from a plain Python list.
arr1 = np.array([1, 2, 3, 4, 5])
print(arr1)  # [1 2 3 4 5]

# Build a 2-D array (2 rows x 3 columns) from nested lists.
arr2 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr2)
# [[1 2 3]
#  [4 5 6]]

# Arrays with preset contents.
zeros = np.zeros(shape=(3, 4))  # 3x4 array filled with zeros
ones = np.ones(shape=(2, 3))  # 2x3 array filled with ones
# np.empty only allocates memory: the 2x2 result holds arbitrary values,
# it is NOT an array with zero elements.
empty = np.empty(shape=(2, 2))
arange = np.arange(0, 10, 2)  # [0 2 4 6 8]
linspace = np.linspace(start=0, stop=1, num=5)  # 5 evenly spaced values from 0 to 1

# Random arrays.
random_arr = np.random.rand(3, 3)  # uniform floats in [0, 1)
random_int = np.random.randint(low=0, high=10, size=(3, 3))  # integers in [0, 10)
import numpy as np
arr = np.array([1, 2, 3, 4, 5])

# Basic array attributes.
print(arr.shape)  # (5,)
print(arr.dtype)  # platform default integer type, e.g. int64
print(arr.ndim)  # 1
print(arr.size)  # 5

# Element-wise arithmetic, spelled with the explicit ufuncs.
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
print(np.add(arr1, arr2))  # [5 7 9]
print(np.multiply(arr1, arr2))  # [ 4 10 18]
print(np.square(arr1))  # [1 4 9]

# Element-wise math functions: square root, exponential, sine, natural log.
for _ufunc in (np.sqrt, np.exp, np.sin, np.log):
    print(_ufunc(arr1))

# Summary statistics via the equivalent ndarray methods.
arr = np.array([1, 2, 3, 4, 5])
print(arr.mean())  # mean: 3.0
print(np.median(arr))  # median: 3.0
print(arr.std())  # standard deviation
print(arr.sum())  # sum: 15
print(arr.min())  # minimum: 1
print(arr.max())  # maximum: 5
import numpy as np
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# Single-element indexing (row, then column).
print(arr[0][0])  # 1
print(arr[1][2])  # 6

# Slicing: rows 0-1, columns 1-2.
print(arr[:2, 1:])
# [[2 3]
#  [5 6]]

# Boolean indexing: keep the elements greater than 5.
print(arr[np.greater(arr, 5)])  # [6 7 8 9]

# Filtering with a reusable boolean mask (even numbers).
mask = np.mod(arr, 2) == 0
print(arr[mask])  # [2 4 6 8]
Pandas是强大的数据分析库,提供DataFrame和Series数据结构。
pip install pandas
import pandas as pd
# A Series with the default integer index.
s = pd.Series(data=[1, 2, 3, 4, 5])
print(s)

# A Series with explicit string labels.
s = pd.Series(data=[100, 200, 300], index=list('abc'))
print(s.loc['a'])  # 100

# A Series built from a dict: keys become the index.
data = dict(a=100, b=200, c=300)
s = pd.Series(data)
print(s)
import pandas as pd
# Build a DataFrame from a dict of equal-length columns.
data = {
    'name': ['张三', '李四', '王五'],
    'age': [25, 30, 35],
    'city': ['北京', '上海', '广州'],
}
df = pd.DataFrame(data)
print(df)

# Inspect the data.
print(df.head())  # first 5 rows (all 3 rows here)
print(df.tail())  # last 5 rows (all 3 rows here)
# info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())  # summary statistics of the numeric columns

# Column access: a single label gives a Series, a list gives a DataFrame.
print(df['name'])
print(df[['name', 'age']])

# Row access.
print(df.loc[0])  # by index label
print(df.iloc[0])  # by integer position

# Boolean filtering; combine conditions with & and parenthesize each one.
print(df[df['age'] > 25])
print(df[(df['age'] > 25) & (df['city'] == '上海')])
import pandas as pd
df = pd.DataFrame({
    'name': ['张三', '李四', '王五'],
    'age': [25, 30, 35],
    'salary': [5000, 8000, 10000],
})

# Add a derived column (10% of salary).
df['bonus'] = df['salary'].mul(0.1)

# Remove that column again.
df = df.drop(columns='bonus')

# Append one row by concatenating a single-row frame; the index is renumbered.
new_row = {'name': '赵六', 'age': 28, 'salary': 7000}
_new_frame = pd.DataFrame([new_row])
df = pd.concat([df, _new_frame], ignore_index=True)

# Remove the row whose index label is 0.
df = df.drop(index=0)

# Sort by age, oldest first.
df_sorted = df.sort_values(by='age', ascending=False)

# Mean salary per age group.
grouped = df.groupby('age')['salary'].mean()
print(grouped)
import pandas as pd
# Each reader below rebinds df, so only the JSON data is left to be written.
# Load a CSV file.
df = pd.read_csv(filepath_or_buffer='data.csv')

# Load an Excel workbook.
df = pd.read_excel(io='data.xlsx')

# Load a JSON file.
df = pd.read_json(path_or_buf='data.json')

# Save as CSV without the index column.
df.to_csv(path_or_buf='output.csv', index=False)

# Save as Excel without the index column.
df.to_excel(excel_writer='output.xlsx', index=False)

# Save as JSON, one object per row.
df.to_json(path_or_buf='output.json', orient='records')
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
})

# Locate missing values: per-cell mask, then the count per column.
_missing = df.isna()
print(_missing)
print(_missing.sum())

# Drop every row that contains at least one missing value.
df_dropped = df.dropna(axis=0, how='any')

# Fill missing values: first with a constant, then with each column's mean
# (the second assignment replaces the first result).
df_filled = df.fillna(value=0)
df_filled = df.fillna(value=df.mean())

# Drop duplicated rows.
df_unique = df.drop_duplicates()

# Rename columns A and B.
df = df.rename(columns=dict(A='col1', B='col2'))
Matplotlib是Python的绘图库,用于创建各种图表。
pip install matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Line chart of sin(x) sampled at 100 points on [0, 10].
x = np.linspace(0, 10, num=100)
y = np.sin(x)
plt.plot(x, y)
plt.title('正弦函数')
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.grid(True)
plt.show()

# Several curves on the same axes; the labels feed the legend.
for _func, _label in ((np.sin, 'sin(x)'), (np.cos, 'cos(x)')):
    plt.plot(x, _func(x), label=_label)
plt.legend()
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Scatter plot of 50 uniformly random points.
x = np.random.rand(50)
y = np.random.rand(50)
plt.scatter(x=x, y=y)
plt.title('散点图')
plt.show()

# Bar chart: one bar per category.
categories = list('ABCD')
values = [25, 40, 30, 55]
plt.bar(x=categories, height=values)
plt.title('柱状图')
plt.show()

# Pie chart; autopct prints each share with one decimal place.
sizes = [25, 30, 20, 25]
labels = list('ABCD')
plt.pie(x=sizes, labels=labels, autopct='%1.1f%%')
plt.title('饼图')
plt.show()

# Histogram of 1000 standard-normal samples in 30 bins.
data = np.random.randn(1000)
plt.hist(x=data, bins=30)
plt.title('直方图')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 10, num=100)

# 2x2 grid of axes; axes.flat walks it row by row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
_panels = (
    (np.sin(x), 'sin(x)'),
    (np.cos(x), 'cos(x)'),
    (np.tan(x), 'tan(x)'),
    (x**2, 'x²'),
)
for _ax, (_curve, _title) in zip(axes.flat, _panels):
    _ax.plot(x, _curve)
    _ax.set_title(_title)

plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Build reproducible sample data: 100 consecutive days starting 2024-01-01.
np.random.seed(42)
dates = pd.date_range(start='2024-01-01', periods=100)
data = {
    'date': dates,
    'sales': np.random.randint(low=100, high=1000, size=100),
    'customers': np.random.randint(low=10, high=100, size=100),
}
df = pd.DataFrame(data)

# Descriptive statistics of the daily sales.
print("销售统计:")
print(df['sales'].describe())

# 7-day moving average (the first 6 rows are NaN until the window fills).
df['sales_ma'] = df['sales'].rolling(7).mean()

# Total sales per calendar month.
df['month'] = df['date'].dt.month
monthly_sales = df.groupby('month')['sales'].sum()
# Visualization: two stacked panels sharing one figure.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
_trend_ax, _monthly_ax = axes

# Top panel: daily sales with the 7-day moving average drawn on top.
_trend_ax.plot(df['date'], df['sales'], label='日销售额')
_trend_ax.plot(df['date'], df['sales_ma'], label='7日移动平均', linewidth=2)
_trend_ax.set_title('销售趋势')
_trend_ax.legend()
_trend_ax.grid(True)

# Bottom panel: total sales per month.
_monthly_ax.bar(monthly_sales.index, monthly_sales.values)
_monthly_ax.set_title('月度销售统计')
_monthly_ax.set_xlabel('月份')
_monthly_ax.set_ylabel('销售额')

plt.tight_layout()
plt.show()
# 练习1: NumPy矩阵统计
import numpy as np
# 10x10 matrix of uniform random floats in [0, 1).
matrix = np.random.rand(10, 10)

# Compute the statistics first, then report each with 4 decimal places.
_mean = matrix.mean()
_std = matrix.std()
_max = matrix.max()
_min = matrix.min()
print(f"均值: {_mean:.4f}")
print(f"标准差: {_std:.4f}")
print(f"最大值: {_max:.4f}")
print(f"最小值: {_min:.4f}")
# 练习2: 学生成绩分析
import pandas as pd
students = pd.DataFrame({
    'name': ['张三', '李四', '王五', '赵六'],
    'math': [85, 90, 78, 92],
    'english': [88, 85, 90, 87],
    'python': [92, 88, 85, 95],
})

# Take the score columns once, then derive each student's total and average.
_scores = students[['math', 'english', 'python']]
students['total'] = _scores.sum(axis=1)
students['average'] = _scores.mean(axis=1)
print(students)
# 练习3: 数据清洗
import pandas as pd
import numpy as np
# Sample data with one missing value in column A and one in column B.
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, 5],
    'C': [1, 2, 3, 4, 5],
})
print("原始数据:")
print(df)

# Count the missing values per column.
_missing_per_column = df.isna().sum()
print("\n缺失值统计:")
print(_missing_per_column)

# Replace each missing value with its column mean.
_column_means = df.mean()
df_cleaned = df.fillna(value=_column_means)
print("\n清洗后数据:")
print(df_cleaned)

# Summary statistics of the cleaned data.
print("\n统计报告:")
print(df_cleaned.describe())
# 练习4: 组合图表
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(1, 6)
y1 = [20, 35, 30, 35, 27]
y2 = [25, 32, 34, 20, 25]

# One figure with two side-by-side panels.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: line chart, one line per product.
for _series, _marker, _label in ((y1, 'o', '产品A'), (y2, 's', '产品B')):
    ax1.plot(x, _series, marker=_marker, label=_label)
ax1.set_title('销售趋势')
ax1.set_xlabel('月份')
ax1.set_ylabel('销售额')
ax1.legend()
ax1.grid(True)

# Right panel: grouped bars, shifted half a bar width to either side of x.
width = 0.35
_half = width / 2
ax2.bar(x - _half, y1, width, label='产品A')
ax2.bar(x + _half, y2, width, label='产品B')
ax2.set_title('月度对比')
ax2.set_xlabel('月份')
ax2.set_ylabel('销售额')
ax2.legend()

plt.tight_layout()
plt.show()