""" 华为应用市场爬虫主程序 """ import asyncio from typing import Optional, List from app.crawler.huawei_api import HuaweiAPI from app.crawler.data_processor import DataProcessor from app.crawler.app_ids import KNOWN_APP_IDS from app.database import AsyncSessionLocal class HuaweiCrawler: """华为应用市场爬虫""" def __init__(self): self.api = HuaweiAPI() async def __aenter__(self): """异步上下文管理器入口""" return self async def __aexit__(self, exc_type, exc_val, exc_tb): """异步上下文管理器出口""" await self.api.close() async def crawl_by_ids( self, id_list: Optional[List[int]] = None, limit: Optional[int] = None, batch_size: int = 50 # 并发批次大小,默认50 ) -> tuple: """ 根据ID列表爬取应用(支持并发) Args: id_list: ID列表,如果为None则使用KNOWN_APP_IDS limit: 限制爬取数量 batch_size: 并发批次大小,默认5个 Returns: (成功数量, 失败数量) """ if id_list is None: id_list = KNOWN_APP_IDS if limit: id_list = id_list[:limit] success_count = 0 failed_count = 0 print("=" * 80) print(f"开始爬取 {len(id_list)} 个应用(并发数: {batch_size})") print("=" * 80) # 分批处理 for batch_start in range(0, len(id_list), batch_size): batch_end = min(batch_start + batch_size, len(id_list)) batch = id_list[batch_start:batch_end] # 并发爬取一批 tasks = [] for i, app_id_num in enumerate(batch, batch_start + 1): app_id = f"C{app_id_num:019d}" tasks.append(self._crawl_single_app(app_id, i, len(id_list))) # 等待这一批完成 results = await asyncio.gather(*tasks, return_exceptions=True) # 统计结果 for result in results: if isinstance(result, Exception): failed_count += 1 elif result: success_count += 1 else: failed_count += 1 # 批次间短暂延迟 if batch_end < len(id_list): await asyncio.sleep(0.2) print("\n" + "=" * 80) print(f"爬取完成: 成功 {success_count} 个, 失败 {failed_count} 个") print("=" * 80) return success_count, failed_count async def _crawl_single_app(self, app_id: str, index: int, total: int) -> bool: """爬取单个应用(每个任务使用独立的数据库会话)""" # 为每个任务创建独立的数据库会话 async with AsyncSessionLocal() as db_session: processor = DataProcessor(db_session) try: print(f"\n[{index}/{total}] {app_id}", end=" ") # 获取应用信息 app_data = await self.api.get_app_info(app_id=app_id) print(f"✓ {app_data['name']}", end=" ") # 获取评分信息 rating_data = await self.api.get_app_rating(app_id) # 保存到数据库 info_inserted, metric_inserted, rating_inserted = await processor.save_app_data( app_data, rating_data ) # 显示保存状态 status_parts = [] if info_inserted: status_parts.append("新应用") if metric_inserted: status_parts.append("新指标") if rating_inserted: status_parts.append("新评分") if status_parts: print(f"→ {', '.join(status_parts)}") else: print(f"→ 无更新") return True except ValueError: print(f"✗ 跳过(安卓应用)") return False except Exception as e: print(f"✗ 失败: {str(e)[:50]}") return False async def crawl_all(): """爬取所有已知应用""" async with HuaweiCrawler() as crawler: return await crawler.crawl_by_ids() async def crawl_limited(limit: int): """爬取指定数量的应用""" async with HuaweiCrawler() as crawler: return await crawler.crawl_by_ids(limit=limit)