feat: A股历史数据获取系统初始版本
功能: - 获取所有A股从2010年至今的历史行情数据 - 支持parquet和CSV两种格式保存 - 自动处理请求频率限制 - 进度显示和错误处理 环境:conda环境 stock_system (Python 3.10)
This commit is contained in:
23
.gitignore
vendored
Normal file
23
.gitignore
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.pyo
|
||||
.env
|
||||
|
||||
# Data files (large)
|
||||
data/*.parquet
|
||||
data/*.csv
|
||||
!A股股票列表.csv
|
||||
|
||||
# Logs
|
||||
logs/
|
||||
*.log
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
5492
A股股票列表.csv
Normal file
5492
A股股票列表.csv
Normal file
File diff suppressed because it is too large
Load Diff
77
README.md
Normal file
77
README.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# A股历史数据获取系统
|
||||
|
||||
获取所有A股从2010年至今的历史行情数据。
|
||||
|
||||
## 环境配置
|
||||
|
||||
```bash
|
||||
# 创建conda环境
|
||||
conda create -n stock_system python=3.10 -y
|
||||
conda activate stock_system
|
||||
|
||||
# 安装依赖
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Tushare Token
|
||||
|
||||
需要在 tushare.pro 注册并获取token。
|
||||
|
||||
设置方式:
|
||||
1. 注册账号:https://tushare.pro/register
|
||||
2. 获取token后,在代码中调用 `setup_tushare('your_token')` 传入,或设置环境变量 `TUSHARE_TOKEN`:
|
||||
```bash
|
||||
export TUSHARE_TOKEN=your_token_here
|
||||
```
|
||||
|
||||
## 运行
|
||||
|
||||
```bash
|
||||
# 方式1:直接运行
|
||||
python fetch_history.py
|
||||
|
||||
# 方式2:使用脚本
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
## 数据说明
|
||||
|
||||
- 数据来源:Tushare Pro API
|
||||
- 时间范围:2010-01-01 至今
|
||||
- 数据格式:
|
||||
- parquet:高效压缩格式(推荐)
|
||||
- CSV:通用格式
|
||||
|
||||
## 输出字段
|
||||
|
||||
| 字段 | 说明 |
|
||||
|------|------|
|
||||
| ts_code | 股票代码 |
|
||||
| trade_date | 交易日期 |
|
||||
| open | 开盘价 |
|
||||
| high | 最高价 |
|
||||
| low | 最低价 |
|
||||
| close | 收盘价 |
|
||||
| pre_close | 昨收价 |
|
||||
| change | 涨跌额 |
|
||||
| pct_chg | 涨跌幅(%) |
|
||||
| vol | 成交量(手) |
|
||||
| amount | 成交额(千元) |
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
stock_system/
|
||||
├── A股股票列表.csv # 股票列表
|
||||
├── fetch_history.py # 数据获取脚本
|
||||
├── requirements.txt # Python依赖
|
||||
├── run.sh # 运行脚本
|
||||
├── data/ # 数据保存目录
|
||||
└── logs/ # 日志目录
|
||||
```
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. Tushare有请求频率限制,代码中设置了间隔
|
||||
2. 全量获取约5000只股票数据需要较长时间
|
||||
3. 建议在网络稳定的环境下运行
|
||||
167
fetch_history.py
Normal file
167
fetch_history.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
A股历史数据获取系统
|
||||
功能:获取所有A股从2010年至今的历史行情数据
|
||||
"""
|
||||
|
||||
import tushare as ts
|
||||
import pandas as pd
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# --- Paths -------------------------------------------------------------
# All locations are derived from the directory that contains this script.
BASE_DIR = Path(__file__).parent
STOCK_LIST_FILE = BASE_DIR.joinpath('A股股票列表.csv')
DATA_DIR = BASE_DIR.joinpath('data')
LOGS_DIR = BASE_DIR.joinpath('logs')

# Create the output directories up front (no error if they already exist).
DATA_DIR.mkdir(exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)

# --- Date range (YYYYMMDD strings) --------------------------------------
START_DATE = '20100101'
END_DATE = f'{datetime.now():%Y%m%d}'  # "today", evaluated at import time

# --- Request pacing -----------------------------------------------------
# Number of stocks requested per batch (Tushare limit).
BATCH_SIZE = 50
# Pause between requests, in seconds, to avoid hitting the API too often.
REQUEST_INTERVAL = 0.3
|
||||
|
||||
|
||||
def setup_tushare(token=None):
    """Create and return a Tushare Pro API client.

    Args:
        token: Tushare API token. When omitted or empty, the
            ``TUSHARE_TOKEN`` environment variable is used if it is set.
            If neither is available, no token is registered and
            ``ts.pro_api()`` is called as-is.

    Returns:
        The client object returned by ``ts.pro_api()``.
    """
    # The project README instructs users to `export TUSHARE_TOKEN=...`;
    # honor that variable when the caller does not pass a token explicitly.
    token = token or os.environ.get('TUSHARE_TOKEN')
    if token:
        ts.set_token(token)
    return ts.pro_api()
|
||||
|
||||
|
||||
def load_stock_list():
    """Read the stock-list CSV into a DataFrame.

    Surrounding whitespace is stripped from the column names, and the
    number of rows loaded is printed.

    Returns:
        pandas.DataFrame: the contents of ``STOCK_LIST_FILE``.
    """
    stock_list = pd.read_csv(STOCK_LIST_FILE).rename(columns=str.strip)
    print(f"加载股票列表: {len(stock_list)} 只股票")
    return stock_list
|
||||
|
||||
|
||||
def get_stock_codes_with_suffix(df):
    """Convert bare stock codes to Tushare ``ts_code`` format.

    Each value in ``df['code']`` is converted to a string, left-padded with
    zeros to six characters, and given an exchange suffix chosen by prefix:

    * ``6`` or ``900``         -> ``.SH`` (Shanghai)
    * ``4``, ``8`` or ``92``   -> ``.BJ`` (Beijing)
    * anything else            -> ``.SZ`` (Shenzhen)

    Args:
        df: Object whose ``'code'`` entry is iterable (in this script, the
            stock-list DataFrame).

    Returns:
        list[str]: one ``ts_code`` per input code, in input order.
    """
    codes = []
    for code in df['code']:
        code = str(code).zfill(6)  # numeric CSV parsing drops leading zeros
        # Previously every non-"6" code was labelled .SZ, which mislabelled
        # Beijing Stock Exchange listings and Shanghai 900xxx codes.
        # NOTE(review): prefix-to-exchange mapping assumes the list holds
        # only exchange-listed stocks — confirm against the CSV contents.
        if code.startswith(('4', '8', '92')):
            suffix = 'BJ'
        elif code.startswith(('6', '900')):
            suffix = 'SH'
        else:
            suffix = 'SZ'
        codes.append(f"{code}.{suffix}")
    return codes
|
||||
|
||||
|
||||
def fetch_daily_data(pro, codes, start_date, end_date):
    """Fetch daily bars for ``codes`` in batches, paging through each batch.

    Codes are requested ``BATCH_SIZE`` at a time. Because a single
    ``pro.daily`` call returns only a capped number of rows, each batch is
    read page by page (via ``offset``) until the server returns no more
    rows; otherwise multi-year, multi-stock requests are silently truncated.

    A batch that raises is reported and skipped (best-effort); pages already
    received for that batch are discarded so no partially fetched batch ends
    up in the result.

    Args:
        pro: Tushare Pro client exposing a ``daily(...)`` method.
        codes: Sequence of ``ts_code`` strings.
        start_date: Start date string passed through to ``pro.daily``.
        end_date: End date string passed through to ``pro.daily``.

    Returns:
        list: the non-empty DataFrames received, in request order.
    """
    all_data = []
    total = len(codes)

    for i in range(0, total, BATCH_SIZE):
        ts_codes = ','.join(codes[i:i + BATCH_SIZE])
        print(f"获取第 {i+1}-{min(i+BATCH_SIZE, total)} 只股票数据...")

        try:
            pages = _fetch_batch_pages(pro, ts_codes, start_date, end_date)
        except Exception as e:
            # Best-effort: report the failure and continue with the next batch.
            print(f"  错误: {e}")
            continue

        if pages:
            all_data.extend(pages)
            print(f"  成功获取 {sum(len(page) for page in pages)} 条记录")
        else:
            print(f"  无数据")

    return all_data


def _fetch_batch_pages(pro, ts_codes, start_date, end_date):
    """Return every non-empty result page for one comma-joined code batch."""
    pages = []
    offset = 0
    while True:
        try:
            page = pro.daily(ts_code=ts_codes, start_date=start_date,
                             end_date=end_date, offset=offset)
        finally:
            # Pace every request, successful or not, to avoid rate limiting.
            time.sleep(REQUEST_INTERVAL)

        if page is None or len(page) == 0:
            return pages
        # Safety net: if the server ignores ``offset`` it would return the
        # same page forever; stop instead of looping endlessly.
        if pages and page.equals(pages[-1]):
            return pages
        pages.append(page)
        offset += len(page)
|
||||
|
||||
|
||||
def save_to_parquet(df, filename):
    """Write ``df`` to ``DATA_DIR / filename`` as Parquet, without the index.

    Prints the destination path and the resulting file size in MB.
    """
    target = DATA_DIR / filename
    df.to_parquet(target, index=False)
    print(f"保存到: {target}")
    size_mb = target.stat().st_size / 1024 / 1024
    print(f"文件大小: {size_mb:.2f} MB")
|
||||
|
||||
|
||||
def save_to_csv(df, filename):
    """Write ``df`` to ``DATA_DIR / filename`` as CSV, without the index.

    Prints the destination path and the resulting file size in MB.
    """
    target = DATA_DIR / filename
    df.to_csv(target, index=False)
    print(f"保存到: {target}")
    size_mb = target.stat().st_size / 1024 / 1024
    print(f"文件大小: {size_mb:.2f} MB")
|
||||
|
||||
|
||||
def main():
    """Run the full pipeline: load codes, fetch daily bars, save, summarize.

    The Tushare token is taken from the ``TUSHARE_TOKEN`` environment
    variable when it is set (as the README documents); when it is not set,
    the client is created without registering a token.

    The combined data is written to ``DATA_DIR`` as both Parquet and CSV,
    with today's date in the file name.
    """
    print("=" * 60)
    print("A股历史数据获取系统")
    print("=" * 60)
    print(f"数据时间范围: {START_DATE} ~ {END_DATE}")
    print(f"数据保存目录: {DATA_DIR}")
    print("=" * 60)

    # Initialize the Tushare client. Passing None (variable unset) keeps the
    # previous behavior of calling setup_tushare() without a token.
    print("\n初始化 Tushare...")
    pro = setup_tushare(os.environ.get('TUSHARE_TOKEN'))

    # Load the stock list and convert the codes to ts_code format.
    print("\n加载股票列表...")
    stock_df = load_stock_list()
    codes = get_stock_codes_with_suffix(stock_df)
    print(f"共 {len(codes)} 只股票")

    # Fetch daily bars for every code.
    print("\n开始获取日线数据...")
    all_data = fetch_daily_data(pro, codes, START_DATE, END_DATE)

    if all_data:
        # Merge the per-request frames into one table.
        print("\n合并数据...")
        combined_df = pd.concat(all_data, ignore_index=True)
        print(f"总记录数: {len(combined_df)}")

        # Order rows by stock code, then by trade date.
        combined_df = combined_df.sort_values(['ts_code', 'trade_date']).reset_index(drop=True)

        # Persist the result; the file name carries today's date.
        print("\n保存数据...")
        timestamp = datetime.now().strftime('%Y%m%d')

        # Parquet: compact, recommended for further processing.
        save_to_parquet(combined_df, f'A股日线数据_{timestamp}.parquet')

        # CSV: kept alongside for easy inspection.
        save_to_csv(combined_df, f'A股日线数据_{timestamp}.csv')

        # Print a short summary of what was fetched.
        print("\n数据概览:")
        print(f"  股票数量: {combined_df['ts_code'].nunique()}")
        print(f"  日期范围: {combined_df['trade_date'].min()} ~ {combined_df['trade_date'].max()}")
        print(f"  总记录数: {len(combined_df)}")
        print("\n列名:")
        print(combined_df.columns.tolist())
        print("\n前5条数据:")
        print(combined_df.head())
    else:
        print("未获取到任何数据")

    print("\n" + "=" * 60)
    print("数据获取完成!")
    print("=" * 60)
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|
||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
tushare>=1.4.0
|
||||
pandas>=2.0.0
pyarrow>=10.0.1
|
||||
Reference in New Issue
Block a user