功能:
- 多步骤爬取流程(入口页 → 列表页 → 详情页)
- 浏览器爬虫支持(Playwright,处理 JS 渲染)
- 比亚迪汽车爬虫示例
- 后台管理界面
- 数据存储和导出

技术栈:
- Python 3 + Flask
- Playwright(浏览器自动化)
- BeautifulSoup(HTML 解析)

端口:
- API 服务: 19011
- 后台管理: 19012

108 lines · 3.1 KiB · Python
"""Base spider class.

Defines the abstract crawling interface shared by all concrete spiders
in the multi-step pipeline (entry page -> list page -> detail page).
"""

import logging
import time
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# Module-level logger named after the module, per logging convention.
logger = logging.getLogger(__name__)
class BaseSpider(ABC):
    """Abstract base class for spiders.

    Subclasses implement :meth:`start_requests` and :meth:`parse`; this
    base provides a retrying HTTP session, HTML parsing helpers, and an
    in-memory result store.
    """

    # Spider identifier; subclasses override with a unique name.
    name = "base"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create the spider.

        Args:
            config: Optional settings dict. Recognized keys:
                ``user_agent``, ``retry_times``, ``retry_delay``,
                ``timeout``, ``request_delay``.
        """
        self.config = config or {}
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": self.config.get("user_agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        })
        # Accumulates items produced by save_result().
        self.results = []

    @abstractmethod
    def start_requests(self):
        """Generate the initial requests (implemented by subclasses)."""

    @abstractmethod
    def parse(self, response, **kwargs):
        """Parse a response (implemented by subclasses)."""

    def request(self, url: str, method: str = "GET", **kwargs) -> "Optional[requests.Response]":
        """Send an HTTP request with retries.

        Retries up to ``retry_times`` (default 3) attempts, sleeping
        ``retry_delay`` seconds (default 2) between failed attempts.
        A ``request_delay`` pause (default 1s) follows each successful
        response for politeness.

        Args:
            url: Target URL.
            method: HTTP method, default ``"GET"``.
            **kwargs: Extra arguments forwarded to ``Session.request``.

        Returns:
            The successful response, or ``None`` when every attempt
            failed (4xx/5xx statuses count as failures via
            ``raise_for_status``).
        """
        retry_times = self.config.get("retry_times", 3)
        retry_delay = self.config.get("retry_delay", 2)
        timeout = self.config.get("timeout", 30)

        for attempt in range(retry_times):
            try:
                logger.info(f"请求: {url} (尝试 {attempt + 1}/{retry_times})")

                response = self.session.request(
                    method=method,
                    url=url,
                    timeout=timeout,
                    **kwargs
                )
                response.raise_for_status()

                # Politeness delay between consecutive requests.
                delay = self.config.get("request_delay", 1)
                if delay > 0:
                    time.sleep(delay)

                return response

            except requests.exceptions.RequestException as e:
                logger.warning(f"请求失败: {e}")
                if attempt < retry_times - 1:
                    time.sleep(retry_delay)
                else:
                    logger.error(f"请求最终失败: {url}")
                    return None

        # Only reachable when retry_times <= 0.
        return None

    def parse_html(self, html: str) -> "BeautifulSoup":
        """Parse an HTML string into a BeautifulSoup tree.

        NOTE(review): uses the "lxml" parser, which requires the
        third-party ``lxml`` package to be installed.
        """
        return BeautifulSoup(html, "lxml")

    def extract_text(self, element) -> str:
        """Return the stripped text of a bs4 element, or "" for None."""
        if element is None:
            return ""
        return element.get_text(strip=True)

    def extract_attr(self, element, attr: str) -> str:
        """Return an attribute of a bs4 element, or "" for None.

        NOTE(review): for multi-valued attributes (e.g. ``class``)
        bs4's ``get`` may return a list rather than a str — confirm
        callers only pass single-valued attributes.
        """
        if element is None:
            return ""
        return element.get(attr, "")

    def css_select(self, soup: "BeautifulSoup", selector: str) -> List:
        """Return all elements matching a CSS selector."""
        return soup.select(selector)

    def css_select_one(self, soup: "BeautifulSoup", selector: str):
        """Return the first element matching a CSS selector, or None."""
        return soup.select_one(selector)

    def save_result(self, data: Dict):
        """Append one scraped item to the in-memory result list."""
        self.results.append(data)

    def get_results(self) -> List[Dict]:
        """Return all accumulated results."""
        return self.results

    def clear_results(self):
        """Discard all accumulated results."""
        self.results = []