# Listing metadata from the code viewer: 186 lines, 5.4 KiB, Python.
"""
|
||
A股历史数据获取系统
|
||
功能:获取所有A股从2010年至今的历史行情数据
|
||
"""
|
||
|
||
import tushare as ts
|
||
import pandas as pd
|
||
import os
|
||
import time
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# Configuration: every path is anchored at the directory containing this file.
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR.joinpath('data')
LOGS_DIR = BASE_DIR.joinpath('logs')
STOCK_LIST_FILE = BASE_DIR.joinpath('A股股票列表.csv')

# Make sure the data and log directories exist before anything is written.
DATA_DIR.mkdir(exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)

# Date range to download, as YYYYMMDD strings; the end is the import-time date.
START_DATE = '20100101'
END_DATE = f"{datetime.now():%Y%m%d}"

# Number of stocks requested per batch (tushare limit).
BATCH_SIZE = 50

# Pause between requests, in seconds, to avoid calling the API too frequently.
REQUEST_INTERVAL = 0.3
|
||
|
||
|
||
def setup_tushare(token=None):
    """Initialise tushare and return a pro API client.

    The token is resolved in priority order: the ``token`` argument, then the
    ``TUSHARE_TOKEN`` environment variable, then the contents of
    ``config.txt`` in ``BASE_DIR``.

    Args:
        token: Tushare API token. When falsy, the fallbacks above are tried.

    Returns:
        The object returned by ``ts.pro_api()`` after the token is registered.

    Raises:
        ValueError: If none of the three sources yields a non-empty token.
    """
    # Priority: argument > environment variable > config file.
    if not token:
        token = os.environ.get('TUSHARE_TOKEN', '')

    if not token:
        # Fall back to the config file next to this script.
        config_file = BASE_DIR / 'config.txt'
        if config_file.exists():
            # Decode explicitly rather than with the locale default;
            # 'utf-8-sig' also drops a leading BOM so it cannot end up
            # inside the token.
            token = config_file.read_text(encoding='utf-8-sig').strip()

    if not token:
        print("错误:未设置 Tushare Token!")
        print("请通过以下方式之一设置:")
        print(" 1. 设置环境变量: export TUSHARE_TOKEN=your_token")
        print(" 2. 创建配置文件: echo 'your_token' > config.txt")
        print(" 3. 注册地址: https://tushare.pro/register")
        raise ValueError("缺少 Tushare Token")

    ts.set_token(token)
    return ts.pro_api()
|
||
|
||
|
||
def load_stock_list():
    """Read the stock list CSV and return it as a DataFrame.

    Column names are stripped of surrounding whitespace, and the number of
    rows loaded is printed.

    Returns:
        The DataFrame read from ``STOCK_LIST_FILE``.
    """
    stock_table = pd.read_csv(STOCK_LIST_FILE)

    # Header cells may carry stray whitespace; normalise them.
    cleaned_headers = stock_table.columns.str.strip()
    stock_table.columns = cleaned_headers

    stock_count = len(stock_table)
    print(f"加载股票列表: {stock_count} 只股票")
    return stock_table
|
||
|
||
|
||
def get_stock_codes_with_suffix(df):
    """Convert bare stock codes to tushare ``ts_code`` form by adding a suffix.

    Each value in ``df['code']`` is converted to a string, left-padded with
    zeros to six characters, and given an exchange suffix chosen from its
    leading digits:

    * ``6`` or ``900`` -> ``.SH``
    * ``4``, ``8`` or ``92`` -> ``.BJ``
    * anything else -> ``.SZ``

    Args:
        df: Object whose ``df['code']`` yields the stock codes (ints or
            strings).

    Returns:
        A list of ``"<code>.<suffix>"`` strings, in input order.
    """
    codes = []
    for raw_code in df['code']:
        # Codes read as integers lose their leading zeros; restore them.
        code = str(raw_code).zfill(6)
        if code.startswith(('6', '900')):
            suffix = 'SH'
        elif code.startswith(('4', '8', '92')):
            # Beijing Stock Exchange codes; without this branch they would
            # fall through to the .SZ default.
            suffix = 'BJ'
        else:
            suffix = 'SZ'
        codes.append(f"{code}.{suffix}")
    return codes
|
||
|
||
|
||
def fetch_daily_data(pro, codes, start_date, end_date):
    """Fetch daily bars for ``codes`` in batches of ``BATCH_SIZE``.

    Each batch is sent as one ``pro.daily`` call with the codes joined by
    commas. A failing batch is reported on stdout and skipped; the loop
    pauses ``REQUEST_INTERVAL`` seconds after every batch.

    Args:
        pro: Client object exposing a ``daily(ts_code=..., start_date=...,
            end_date=...)`` method.
        codes: Sequence of ts_code strings.
        start_date: Start of the range, passed through to ``pro.daily``.
        end_date: End of the range, passed through to ``pro.daily``.

    Returns:
        A list with one non-empty result per successful batch.
    """
    # NOTE(review): if the data source caps rows per request, 50 codes over
    # the full date range may come back truncated -- confirm against the API.
    frames = []
    code_count = len(codes)

    for offset in range(0, code_count, BATCH_SIZE):
        batch_end = offset + BATCH_SIZE
        joined_codes = ','.join(codes[offset:batch_end])

        try:
            print(f"获取第 {offset + 1}-{min(batch_end, code_count)} 只股票数据...")
            batch_df = pro.daily(
                ts_code=joined_codes,
                start_date=start_date,
                end_date=end_date,
            )

            if batch_df is None or len(batch_df) == 0:
                print(" 无数据")
            else:
                frames.append(batch_df)
                print(f" 成功获取 {len(batch_df)} 条记录")

        except Exception as exc:
            # Best effort: report the failure and carry on with the next batch.
            print(f" 错误: {exc}")

        # Throttle so requests are not sent too quickly.
        time.sleep(REQUEST_INTERVAL)

    return frames
|
||
|
||
|
||
def save_to_parquet(df, filename):
    """Write ``df`` to ``DATA_DIR / filename`` in Parquet format.

    The index is not written. The destination path and the resulting file
    size in MB are printed.

    Args:
        df: DataFrame to persist.
        filename: File name (relative to ``DATA_DIR``) to write to.
    """
    target = DATA_DIR / filename
    df.to_parquet(target, index=False)
    print(f"保存到: {target}")

    size_mb = target.stat().st_size / 1024 / 1024
    print(f"文件大小: {size_mb:.2f} MB")
|
||
|
||
|
||
def save_to_csv(df, filename):
    """Write ``df`` to ``DATA_DIR / filename`` as CSV.

    The index is not written. The destination path and the resulting file
    size in MB are printed.

    Args:
        df: DataFrame to persist.
        filename: File name (relative to ``DATA_DIR``) to write to.
    """
    target = DATA_DIR / filename
    df.to_csv(target, index=False)
    print(f"保存到: {target}")

    size_mb = target.stat().st_size / 1024 / 1024
    print(f"文件大小: {size_mb:.2f} MB")
|
||
|
||
|
||
def main():
    """Run the whole download: set up, fetch, merge, save, and summarise.

    Prints a banner with the configured date range and data directory,
    initialises tushare, loads the stock list, and fetches daily data for
    every code. When data came back it is concatenated, sorted by
    ``ts_code`` and ``trade_date``, saved as both Parquet and CSV under a
    date-stamped name, and summarised on stdout. A closing banner is
    printed in every case.
    """
    rule = "=" * 60

    # Opening banner with the effective configuration.
    print(rule)
    print("A股历史数据获取系统")
    print(rule)
    print(f"数据时间范围: {START_DATE} ~ {END_DATE}")
    print(f"数据保存目录: {DATA_DIR}")
    print(rule)

    # Initialise tushare.
    print("\n初始化 Tushare...")
    pro = setup_tushare()

    # Load the stock list and convert the codes to ts_code form.
    print("\n加载股票列表...")
    codes = get_stock_codes_with_suffix(load_stock_list())
    print(f"共 {len(codes)} 只股票")

    # Fetch the daily bars.
    print("\n开始获取日线数据...")
    batches = fetch_daily_data(pro, codes, START_DATE, END_DATE)

    if not batches:
        print("未获取到任何数据")
    else:
        # Merge all batches into one frame.
        print("\n合并数据...")
        merged = pd.concat(batches, ignore_index=True)
        print(f"总记录数: {len(merged)}")

        # Order rows by stock code, then trade date.
        merged = merged.sort_values(['ts_code', 'trade_date'])
        merged = merged.reset_index(drop=True)

        # Persist under a name stamped with today's date.
        print("\n保存数据...")
        day_stamp = datetime.now().strftime('%Y%m%d')

        # Parquet first (the recommended format), then CSV for easy viewing.
        save_to_parquet(merged, f'A股日线数据_{day_stamp}.parquet')
        save_to_csv(merged, f'A股日线数据_{day_stamp}.csv')

        # Summary of what was downloaded.
        trade_dates = merged['trade_date']
        print("\n数据概览:")
        print(f" 股票数量: {merged['ts_code'].nunique()}")
        print(f" 日期范围: {trade_dates.min()} ~ {trade_dates.max()}")
        print(f" 总记录数: {len(merged)}")
        print("\n列名:")
        print(merged.columns.tolist())
        print("\n前5条数据:")
        print(merged.head())

    print("\n" + rule)
    print("数据获取完成!")
    print(rule)
|
||
|
||
|
||
# Run the download only when this file is executed as a script.
if __name__ == '__main__':
    main()