# Files
# stock_system/fetch_history.py
#
# 186 lines
# 5.4 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
A-share historical data fetching system.

Purpose: fetch historical daily market data for all A-share stocks from 2010 to the present.
"""
import tushare as ts
import pandas as pd
import os
import time
from datetime import datetime
from pathlib import Path
# Configuration.
# Every path is anchored at the directory that contains this script.
BASE_DIR = Path(__file__).parent
STOCK_LIST_FILE = BASE_DIR.joinpath('A股股票列表.csv')
DATA_DIR = BASE_DIR.joinpath('data')
LOGS_DIR = BASE_DIR.joinpath('logs')

# Make sure the output directories exist (this runs at import time).
DATA_DIR.mkdir(exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)

# Date range of the download as YYYYMMDD strings: 2010-01-01 through today.
START_DATE = '20100101'
END_DATE = format(datetime.now(), '%Y%m%d')

# Number of stocks requested per batch (the original note attributes this
# value to a tushare limit).
BATCH_SIZE = 50

# Pause between requests, in seconds, to avoid calling the API too frequently.
REQUEST_INTERVAL = 0.3
def setup_tushare(token=None):
    """Initialise tushare and return a pro API client.

    The token is resolved in priority order: the ``token`` argument, then
    the ``TUSHARE_TOKEN`` environment variable, then the contents of
    ``config.txt`` located in ``BASE_DIR``.

    Args:
        token: Optional Tushare token. When falsy (or blank), the
            environment variable and the config file are consulted.

    Returns:
        The object returned by ``ts.pro_api()`` after the token is set.

    Raises:
        ValueError: If no non-empty token can be found in any source.
    """
    # Strip every source the same way so a whitespace-only value is never
    # mistaken for a usable token.
    if isinstance(token, str):
        token = token.strip()
    if not token:
        # Relies on the module-level ``import os``; the former function-local
        # re-import was redundant.
        token = os.environ.get('TUSHARE_TOKEN', '').strip()
    if not token:
        # Fall back to the config file next to this script.
        config_file = BASE_DIR / 'config.txt'
        # is_file() rather than exists(): a directory named config.txt must
        # not be read as a token file.
        if config_file.is_file():
            # utf-8-sig drops a leading byte-order mark (written by some
            # editors) that would otherwise become part of the token, and
            # makes decoding independent of the platform default encoding.
            token = config_file.read_text(encoding='utf-8-sig').strip()
    if not token:
        print("错误:未设置 Tushare Token")
        print("请通过以下方式之一设置:")
        print(" 1. 设置环境变量: export TUSHARE_TOKEN=your_token")
        print(" 2. 创建配置文件: echo 'your_token' > config.txt")
        print(" 3. 注册地址: https://tushare.pro/register")
        raise ValueError("缺少 Tushare Token")
    ts.set_token(token)
    return ts.pro_api()
def load_stock_list():
    """Read the stock list CSV at ``STOCK_LIST_FILE`` into a DataFrame.

    Surrounding whitespace is removed from every column name, and the
    number of loaded rows is printed.

    Returns:
        The loaded ``pandas.DataFrame`` with cleaned column names.
    """
    # Strip whitespace around each header while loading.
    stocks = pd.read_csv(STOCK_LIST_FILE).rename(columns=str.strip)
    row_count = len(stocks)
    print(f"加载股票列表: {row_count} 只股票")
    return stocks
def get_stock_codes_with_suffix(df):
    """Convert bare stock codes to tushare format by adding an exchange suffix.

    Each value of ``df['code']`` is converted to a string, stripped of
    surrounding whitespace and left-padded with zeros to six characters.
    The suffix is then chosen from the leading digits:

    * ``4``, ``8`` or ``92`` -> ``.BJ`` (Beijing Stock Exchange)
    * ``6`` or any other ``9`` -> ``.SH`` (Shanghai Stock Exchange)
    * anything else -> ``.SZ`` (Shenzhen Stock Exchange)

    Previously every code that did not start with ``6`` was labelled
    ``.SZ``, which mislabelled Beijing-listed stocks.

    Args:
        df: Mapping-like object (e.g. a DataFrame) whose ``'code'`` entry
            is an iterable of stock codes (ints or strings).

    Returns:
        A list of ``'<code>.<suffix>'`` strings, in input order.
    """
    codes = []
    for raw_code in df['code']:
        # Numeric CSV columns lose leading zeros; restore the 6-digit form.
        code = str(raw_code).strip().zfill(6)
        # '92' must be tested before the generic '9' Shanghai rule.
        if code.startswith(('4', '8', '92')):
            suffix = 'BJ'
        elif code.startswith(('6', '9')):
            suffix = 'SH'
        else:
            suffix = 'SZ'
        codes.append(f"{code}.{suffix}")
    return codes
def fetch_daily_data(pro, codes, start_date, end_date, batch_size=None,
                     request_interval=None, max_retries=2):
    """Fetch daily bars for ``codes`` in batches and return the raw results.

    Codes are joined with commas and passed to ``pro.daily`` one batch at a
    time. A batch whose request raises is retried with a growing pause
    instead of being dropped after the first error; batches that still fail
    are reported in a summary line at the end. The function stays
    best-effort: it never raises because of a failed request.

    NOTE(review): the API may cap the number of rows returned per call, in
    which case a large batch over a multi-year range could come back
    truncated. This function does not detect that; verify against the
    Tushare documentation and the account's limits.

    Args:
        pro: Client object exposing ``daily(ts_code=..., start_date=...,
            end_date=...)``.
        codes: Sequence of tushare-format stock codes.
        start_date: Start date, passed through to ``pro.daily``.
        end_date: End date, passed through to ``pro.daily``.
        batch_size: Codes per request; defaults to ``BATCH_SIZE``.
        request_interval: Seconds to sleep after each batch; defaults to
            ``REQUEST_INTERVAL``.
        max_retries: Extra attempts made for a batch after its first
            failure (negative values are treated as 0).

    Returns:
        A list with one entry per batch that returned a non-empty result,
        in request order.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if request_interval is None:
        request_interval = REQUEST_INTERVAL
    attempts = max(0, max_retries) + 1

    all_data = []
    failed_codes = []
    total = len(codes)
    for i in range(0, total, batch_size):
        batch_codes = codes[i:i + batch_size]
        ts_codes = ','.join(batch_codes)
        print(f"获取第 {i+1}-{min(i+batch_size, total)} 只股票数据...")
        for attempt in range(1, attempts + 1):
            try:
                df = pro.daily(ts_code=ts_codes, start_date=start_date, end_date=end_date)
            except Exception as e:
                # Deliberately broad: any single request failure must not
                # abort the whole download.
                print(f" 错误: {e}")
                if attempt == attempts:
                    # Out of attempts: remember the codes for the summary.
                    failed_codes.extend(batch_codes)
                else:
                    # Back off a little longer before each retry.
                    wait = request_interval * (2 ** attempt)
                    print(f" {wait:.1f} 秒后重试 ({attempt}/{attempts - 1})...")
                    time.sleep(wait)
            else:
                if df is not None and len(df) > 0:
                    all_data.append(df)
                    print(f" 成功获取 {len(df)} 条记录")
                else:
                    print(f" 无数据")
                break
        # Avoid sending requests too quickly.
        time.sleep(request_interval)
    if failed_codes:
        print(f"警告: {len(failed_codes)} 只股票获取失败: {','.join(failed_codes)}")
    return all_data
def save_to_parquet(df, filename):
    """Write ``df`` to ``DATA_DIR/filename`` in parquet format.

    The index is not written. The destination path and the resulting file
    size in MB are printed afterwards.

    Args:
        df: DataFrame to persist.
        filename: File name relative to ``DATA_DIR``.
    """
    target = DATA_DIR.joinpath(filename)
    df.to_parquet(target, index=False)
    print(f"保存到: {target}")
    # Bytes -> MB for the size report.
    size_mb = target.stat().st_size / 1024 / 1024
    print(f"文件大小: {size_mb:.2f} MB")
def save_to_csv(df, filename):
    """Write ``df`` to ``DATA_DIR/filename`` in CSV format.

    The index is not written. The destination path and the resulting file
    size in MB are printed afterwards.

    Args:
        df: DataFrame to persist.
        filename: File name relative to ``DATA_DIR``.
    """
    target = DATA_DIR.joinpath(filename)
    df.to_csv(target, index=False)
    print(f"保存到: {target}")
    # Bytes -> MB for the size report.
    size_mb = target.stat().st_size / 1024 / 1024
    print(f"文件大小: {size_mb:.2f} MB")
def main():
    """Run the whole pipeline: set up tushare, load codes, fetch, save, report.

    Prints a header, initialises the API client, loads the stock list and
    converts its codes, downloads daily data for ``START_DATE``..``END_DATE``
    and, when anything was returned, merges it, sorts it by code and date,
    writes it as parquet and CSV, and prints a short overview. A closing
    banner is printed in every case.
    """
    rule = "=" * 60
    print(rule)
    print("A股历史数据获取系统")
    print(rule)
    print(f"数据时间范围: {START_DATE} ~ {END_DATE}")
    print(f"数据保存目录: {DATA_DIR}")
    print(rule)

    # Initialise tushare.
    print("\n初始化 Tushare...")
    client = setup_tushare()

    # Load the stock list and convert the codes to tushare format.
    print("\n加载股票列表...")
    stock_table = load_stock_list()
    ts_codes = get_stock_codes_with_suffix(stock_table)
    print(f"{len(ts_codes)} 只股票")

    # Download the daily data.
    print("\n开始获取日线数据...")
    frames = fetch_daily_data(client, ts_codes, START_DATE, END_DATE)

    if not frames:
        print("未获取到任何数据")
    else:
        # Merge all batches into one table.
        print("\n合并数据...")
        merged = pd.concat(frames, ignore_index=True)
        print(f"总记录数: {len(merged)}")

        # Order by stock code, then by trade date.
        merged = merged.sort_values(['ts_code', 'trade_date'])
        merged = merged.reset_index(drop=True)

        # Persist as parquet and also as CSV for easy inspection.
        print("\n保存数据...")
        stamp = datetime.now().strftime('%Y%m%d')
        parquet_name = f'A股日线数据_{stamp}.parquet'
        csv_name = f'A股日线数据_{stamp}.csv'
        save_to_parquet(merged, parquet_name)
        save_to_csv(merged, csv_name)

        # Short overview of what was saved.
        trade_dates = merged['trade_date']
        stock_count = merged['ts_code'].nunique()
        print("\n数据概览:")
        print(f" 股票数量: {stock_count}")
        print(f" 日期范围: {trade_dates.min()} ~ {trade_dates.max()}")
        print(f" 总记录数: {len(merged)}")
        print("\n列名:")
        print(merged.columns.tolist())
        print("\n前5条数据:")
        print(merged.head())

    print("\n" + rule)
    print("数据获取完成!")
    print(rule)


if __name__ == '__main__':
    main()