Files
ns2.0/backend/app/crawler/crawler.py
Nvex 720402ffe7 feat: NEXT Store 2.0 重大更新 - 完整重构前后端
🎉 主要更新:

后端:
- 全新华为应用市场爬虫系统
- 三表分离数据库设计 (app_info, app_metrics, app_rating)
- 完整的API接口 (搜索、分类、热门、上新等)
- 元服务自动识别和分类
- 智能Token管理和数据处理
- 修复热门应用重复显示问题

前端:
- 全新首页设计 (今日上架、热门应用)
- 应用页面 (彩色分类磁贴、智能图标匹配)
- 今日上新页面 (日期切换)
- 热门应用页面 (卡片布局)
- 应用详情页面 (完整信息展示)
- Apple风格搜索栏
- Footer组件
- 底部导航栏优化 (4个导航项)
- 骨架屏加载效果
- FontAwesome图标集成

UI/UX:
- 统一浅色背景 (#F5F5F7)
- 流畅的过渡动画
- 响应式设计
- 毛玻璃效果

文档:
- CHANGELOG.md - 完整更新日志
- QUICKSTART.md - 快速开始
- 多个技术文档和使用指南

版本: v2.0.0
2025-10-25 21:20:32 +08:00

144 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
华为应用市场爬虫主程序
"""
import asyncio
from typing import Optional, List
from app.crawler.huawei_api import HuaweiAPI
from app.crawler.data_processor import DataProcessor
from app.crawler.app_ids import KNOWN_APP_IDS
from app.database import AsyncSessionLocal
class HuaweiCrawler:
    """Huawei AppGallery crawler.

    Wraps a shared :class:`HuaweiAPI` client and persists crawled app data
    through :class:`DataProcessor`.  Use as an async context manager so the
    underlying HTTP client is closed deterministically.
    """

    def __init__(self):
        # One API client reused by every request of a crawl run.
        self.api = HuaweiAPI()

    async def __aenter__(self):
        """Async context manager entry: return the crawler itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: release the HTTP client."""
        await self.api.close()

    async def crawl_by_ids(
        self,
        id_list: Optional[List[int]] = None,
        limit: Optional[int] = None,
        batch_size: int = 50,  # number of apps fetched concurrently per batch
    ) -> tuple:
        """Crawl apps by numeric ID with bounded concurrency.

        Args:
            id_list: Numeric app IDs to crawl; defaults to ``KNOWN_APP_IDS``.
            limit: Optional cap on how many IDs are crawled.
            batch_size: Concurrent batch size (default 50).
                NOTE: the original docstring said "5", contradicting the
                actual default of 50; 50 is what the code uses.

        Returns:
            ``(success_count, failed_count)`` tuple.
        """
        if id_list is None:
            id_list = KNOWN_APP_IDS
        if limit:
            id_list = id_list[:limit]

        success_count = 0
        failed_count = 0

        print("=" * 80)
        # BUGFIX: the original message had an unbalanced "(" — closing paren added.
        print(f"开始爬取 {len(id_list)} 个应用(并发数: {batch_size})")
        print("=" * 80)

        total = len(id_list)
        # Process the ID list in slices of batch_size concurrent tasks.
        for batch_start in range(0, total, batch_size):
            batch_end = min(batch_start + batch_size, total)
            batch = id_list[batch_start:batch_end]

            # Huawei app IDs are "C" + zero-padded 19-digit number.
            tasks = [
                self._crawl_single_app(f"C{app_id_num:019d}", i, total)
                for i, app_id_num in enumerate(batch, batch_start + 1)
            ]

            # return_exceptions=True: one failing task must not abort the batch.
            results = await asyncio.gather(*tasks, return_exceptions=True)

            for result in results:
                if isinstance(result, Exception):
                    failed_count += 1
                elif result:
                    success_count += 1
                else:
                    failed_count += 1

            # Brief pause between batches to be gentle on the remote API.
            if batch_end < total:
                await asyncio.sleep(0.2)

        print("\n" + "=" * 80)
        print(f"爬取完成: 成功 {success_count} 个, 失败 {failed_count}")
        print("=" * 80)
        return success_count, failed_count

    async def _crawl_single_app(self, app_id: str, index: int, total: int) -> bool:
        """Crawl one app and persist it; returns True on success.

        Each task opens its own DB session — async sessions are not safe to
        share across concurrently running tasks.
        """
        async with AsyncSessionLocal() as db_session:
            processor = DataProcessor(db_session)
            try:
                print(f"\n[{index}/{total}] {app_id}", end=" ")
                # Fetch app metadata, then its rating, then persist both.
                app_data = await self.api.get_app_info(app_id=app_id)
                print(f"{app_data['name']}", end=" ")
                rating_data = await self.api.get_app_rating(app_id)
                info_inserted, metric_inserted, rating_inserted = await processor.save_app_data(
                    app_data, rating_data
                )
                # Report which of the three tables actually received new rows.
                status_parts = []
                if info_inserted:
                    status_parts.append("新应用")
                if metric_inserted:
                    status_parts.append("新指标")
                if rating_inserted:
                    status_parts.append("新评分")
                if status_parts:
                    print(f"{', '.join(status_parts)}")
                else:
                    print("→ 无更新")
                return True
            except ValueError:
                # Presumably raised by get_app_info for Android-only apps —
                # TODO(review): confirm against HuaweiAPI.
                print("✗ 跳过(安卓应用)")
                return False
            except Exception as e:
                # Best-effort crawl: log a truncated error and keep going.
                print(f"✗ 失败: {str(e)[:50]}")
                return False
async def crawl_all():
    """Crawl every app in the built-in known-ID list."""
    async with HuaweiCrawler() as c:
        return await c.crawl_by_ids()
async def crawl_limited(limit: int):
    """Crawl at most *limit* apps from the known-ID list."""
    async with HuaweiCrawler() as c:
        return await c.crawl_by_ids(limit=limit)