Files
product-crawler/crawler/base.py
hubian 0ee0abbbd1 feat: 产品参数爬取系统 v1.0.0
功能:
- 多步骤爬取流程(入口页→列表页→详情页)
- 浏览器爬虫支持(Playwright,处理JS渲染)
- 比亚迪汽车爬虫示例
- 后台管理界面
- 数据存储和导出

技术栈:
- Python 3 + Flask
- Playwright (浏览器自动化)
- BeautifulSoup (HTML解析)

端口:
- API服务: 19011
- 后台管理: 19012
2026-04-10 00:45:51 +08:00

108 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
爬虫基类
"""
import time
import requests
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from typing import Dict, List, Any, Optional
import logging
logger = logging.getLogger(__name__)
class BaseSpider(ABC):
    """Base class for product spiders.

    Provides a retrying HTTP session, BeautifulSoup parsing helpers and
    an in-memory result store. Subclasses must implement
    ``start_requests`` and ``parse``.
    """

    # Spider identifier; subclasses should override with a unique name.
    name = "base"

    def __init__(self, config: Optional[Dict] = None):
        """Initialize the spider.

        Args:
            config: Optional settings. Recognized keys: ``user_agent``,
                ``retry_times`` (default 3), ``retry_delay`` (seconds,
                default 2), ``timeout`` (seconds, default 30) and
                ``request_delay`` (seconds slept after each successful
                request, default 1).
        """
        self.config: Dict = config or {}
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.config.get(
                "user_agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            )
        })
        # Scraped items accumulated via save_result().
        self.results: List[Dict] = []

    @abstractmethod
    def start_requests(self):
        """Produce the spider's initial requests."""

    @abstractmethod
    def parse(self, response, **kwargs):
        """Parse a response fetched for this spider."""

    def request(self, url: str, method: str = "GET", **kwargs) -> Optional[requests.Response]:
        """Send an HTTP request with retries and throttling.

        Retries up to ``retry_times`` times on any
        ``requests.RequestException`` (non-2xx responses included, via
        ``raise_for_status``), sleeping ``retry_delay`` seconds between
        attempts and ``request_delay`` seconds after each success.

        Args:
            url: Target URL.
            method: HTTP method name, e.g. ``"GET"``.
            **kwargs: Forwarded to ``requests.Session.request``.

        Returns:
            The successful response, or ``None`` if every attempt failed.
        """
        retry_times = self.config.get("retry_times", 3)
        retry_delay = self.config.get("retry_delay", 2)
        timeout = self.config.get("timeout", 30)
        for attempt in range(retry_times):
            try:
                logger.info(f"请求: {url} (尝试 {attempt + 1}/{retry_times})")
                response = self.session.request(
                    method=method,
                    url=url,
                    timeout=timeout,
                    **kwargs
                )
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                logger.warning(f"请求失败: {e}")
                if attempt < retry_times - 1:
                    time.sleep(retry_delay)
                else:
                    logger.error(f"请求最终失败: {url}")
                    return None
            else:
                # Success path kept outside the try so only the request
                # itself is retried. Politeness delay throttles
                # consecutive requests.
                delay = self.config.get("request_delay", 1)
                if delay > 0:
                    time.sleep(delay)
                return response
        # Reached only when retry_times <= 0 was configured.
        return None

    def parse_html(self, html: str) -> BeautifulSoup:
        """Parse an HTML string with the lxml backend."""
        return BeautifulSoup(html, "lxml")

    def extract_text(self, element) -> str:
        """Return the stripped text of *element*, or "" if it is None."""
        if element is None:
            return ""
        return element.get_text(strip=True)

    def extract_attr(self, element, attr: str) -> str:
        """Return attribute *attr* of *element*, or "" if absent/None.

        NOTE(review): for multi-valued attributes such as ``class``,
        BeautifulSoup's ``get`` returns a list, not a str — confirm
        callers only pass single-valued attributes.
        """
        if element is None:
            return ""
        return element.get(attr, "")

    def css_select(self, soup: BeautifulSoup, selector: str) -> List:
        """Return all elements matching the CSS *selector*."""
        return soup.select(selector)

    def css_select_one(self, soup: BeautifulSoup, selector: str):
        """Return the first element matching *selector*, or None."""
        return soup.select_one(selector)

    def save_result(self, data: Dict) -> None:
        """Append one scraped item to the in-memory result list."""
        self.results.append(data)

    def get_results(self) -> List[Dict]:
        """Return all accumulated results."""
        return self.results

    def clear_results(self) -> None:
        """Discard all accumulated results."""
        self.results = []