🎉 主要更新:
后端:
- 全新华为应用市场爬虫系统
- 三表分离数据库设计 (app_info, app_metrics, app_rating)
- 完整的API接口 (搜索、分类、热门、上新等)
- 元服务自动识别和分类
- 智能Token管理和数据处理
- 修复热门应用重复显示问题
前端:
- 全新首页设计 (今日上架、热门应用)
- 应用页面 (彩色分类磁贴、智能图标匹配)
- 今日上新页面 (日期切换)
- 热门应用页面 (卡片布局)
- 应用详情页面 (完整信息展示)
- Apple风格搜索栏
- Footer组件
- 底部导航栏优化 (4个导航项)
- 骨架屏加载效果
- FontAwesome图标集成
UI/UX:
- 统一浅色背景 (#F5F5F7)
- 流畅的过渡动画
- 响应式设计
- 毛玻璃效果
文档:
- CHANGELOG.md - 完整更新日志
- QUICKSTART.md - 快速开始
- 多个技术文档和使用指南
版本: v2.0.0
144 lines
4.7 KiB
Python
"""
|
||
华为应用市场爬虫主程序
|
||
"""
|
||
import asyncio
|
||
from typing import Optional, List
|
||
from app.crawler.huawei_api import HuaweiAPI
|
||
from app.crawler.data_processor import DataProcessor
|
||
from app.crawler.app_ids import KNOWN_APP_IDS
|
||
from app.database import AsyncSessionLocal
|
||
|
||
|
||
class HuaweiCrawler:
    """Crawler for the Huawei AppGallery.

    Holds a single shared HTTP API client and crawls application IDs in
    concurrent batches, persisting results through per-task database
    sessions.  Use as an async context manager so the HTTP client is
    closed when crawling finishes.
    """

    def __init__(self):
        # One shared API client for all tasks; database sessions are
        # created per task inside _crawl_single_app instead, because an
        # async session must not be shared by concurrent coroutines.
        self.api = HuaweiAPI()

    async def __aenter__(self):
        """Async context manager entry: return the crawler itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the underlying HTTP client."""
        await self.api.close()

    async def crawl_by_ids(
        self,
        id_list: Optional[List[int]] = None,
        limit: Optional[int] = None,
        batch_size: int = 50,  # number of apps fetched concurrently per batch
    ) -> tuple:
        """Crawl applications by numeric ID, in concurrent batches.

        Args:
            id_list: Numeric app IDs to crawl; when None, KNOWN_APP_IDS
                is used.
            limit: Optional cap on how many IDs are crawled.
            batch_size: Concurrent batch size (default 50).

        Returns:
            A ``(success_count, failed_count)`` tuple.
        """
        if id_list is None:
            id_list = KNOWN_APP_IDS

        # NOTE: limit=0 is treated the same as "no limit" (falsy check).
        if limit:
            id_list = id_list[:limit]

        success_count = 0
        failed_count = 0

        print("=" * 80)
        print(f"开始爬取 {len(id_list)} 个应用(并发数: {batch_size})")
        print("=" * 80)

        # Process in batches so at most batch_size requests are in flight.
        for batch_start in range(0, len(id_list), batch_size):
            batch_end = min(batch_start + batch_size, len(id_list))
            batch = id_list[batch_start:batch_end]

            # Launch one task per app in this batch.
            tasks = []
            for i, app_id_num in enumerate(batch, batch_start + 1):
                # AppGallery IDs are "C" + a zero-padded 19-digit number.
                app_id = f"C{app_id_num:019d}"
                tasks.append(self._crawl_single_app(app_id, i, len(id_list)))

            # Wait for the batch; exceptions are returned, not raised, so
            # one failing app cannot abort the whole batch.
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Tally outcomes: an exception or a falsy result is a failure.
            for result in results:
                if isinstance(result, Exception):
                    failed_count += 1
                elif result:
                    success_count += 1
                else:
                    failed_count += 1

            # Brief pause between batches to avoid hammering the API.
            if batch_end < len(id_list):
                await asyncio.sleep(0.2)

        print("\n" + "=" * 80)
        print(f"爬取完成: 成功 {success_count} 个, 失败 {failed_count} 个")
        print("=" * 80)

        return success_count, failed_count

    async def _crawl_single_app(self, app_id: str, index: int, total: int) -> bool:
        """Crawl a single application and persist it.

        Each task opens its own database session because async sessions
        are not safe to share across concurrently running coroutines.

        Args:
            app_id: Full AppGallery ID string (``C`` + 19 digits).
            index: 1-based position of this app, for progress output.
            total: Total number of apps being crawled, for progress output.

        Returns:
            True on success; False when the app was skipped or failed.
        """
        async with AsyncSessionLocal() as db_session:
            processor = DataProcessor(db_session)

            try:
                print(f"\n[{index}/{total}] {app_id}", end=" ")

                # Fetch basic app information.
                app_data = await self.api.get_app_info(app_id=app_id)
                print(f"✓ {app_data['name']}", end=" ")

                # Fetch rating information.
                rating_data = await self.api.get_app_rating(app_id)

                # Persist both payloads; the flags report what was newly
                # inserted (app record / metrics row / rating row).
                info_inserted, metric_inserted, rating_inserted = await processor.save_app_data(
                    app_data, rating_data
                )

                # Summarize what changed for the progress line.
                status_parts = []
                if info_inserted:
                    status_parts.append("新应用")
                if metric_inserted:
                    status_parts.append("新指标")
                if rating_inserted:
                    status_parts.append("新评分")

                if status_parts:
                    print(f"→ {', '.join(status_parts)}")
                else:
                    print("→ 无更新")

                return True

            except ValueError:
                # Presumably get_app_info raises ValueError for
                # Android-only apps — TODO confirm against HuaweiAPI.
                print("✗ 跳过(安卓应用)")
                return False
            except Exception as e:
                # Best-effort crawl: report (truncated) and move on so a
                # single bad app never aborts the run.
                print(f"✗ 失败: {str(e)[:50]}")
                return False
|
||
|
||
|
||
async def crawl_all():
    """Crawl the full set of known application IDs."""
    crawler = HuaweiCrawler()
    async with crawler:
        return await crawler.crawl_by_ids()
|
||
|
||
|
||
async def crawl_limited(limit: int):
    """Crawl at most *limit* applications."""
    crawler = HuaweiCrawler()
    async with crawler:
        return await crawler.crawl_by_ids(limit=limit)
|