# ns2.0/backend/app/crawler/crawl.py

#!/usr/bin/env python3
"""
华为应用市场爬虫 - 命令行入口
一键爬取 guess.py 中的所有应用到数据库
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..'))

import asyncio
import argparse
from app.database import engine, Base
from app.models import AppInfo, AppMetrics, AppRating
from app.crawler.crawler import HuaweiCrawler
from sqlalchemy import text


async def init_database():
    """Create any ORM tables that are missing from the database.

    Every table registered on ``Base.metadata`` is looked up through
    SQLAlchemy's inspector, so the check is not tied to a single SQL dialect
    and a partially created schema is completed rather than skipped. Nothing
    is printed when all tables already exist.

    Returns:
        bool: True when the check (and any table creation) completed, False
        when an exception was raised. The error is printed, not re-raised.
    """
    # Local import: the module-level import block only brings in ``text``.
    from sqlalchemy import inspect as sa_inspect

    def _has_missing_tables(sync_conn):
        """Return True if any mapped table is absent from the database."""
        inspector = sa_inspect(sync_conn)
        return any(
            not inspector.has_table(table.name, schema=table.schema)
            for table in Base.metadata.sorted_tables
        )

    try:
        async with engine.begin() as conn:
            # The inspector is a synchronous API, so it runs through run_sync.
            if await conn.run_sync(_has_missing_tables):
                print("数据库表不存在，正在创建...")
                # create_all leaves existing tables alone (checkfirst=True by
                # default) and only emits CREATE for the missing ones.
                await conn.run_sync(Base.metadata.create_all)
                print("✓ 数据库表创建成功\n")
        return True
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller decide.
        print(f"✗ 数据库检查失败: {e}")
        return False


async def main():
    """Parse command-line options and run the crawler.

    Unless ``--skip-init`` is given, ``init_database()`` is awaited first and
    the crawl is skipped if it reports failure. The database engine is
    disposed on every path once the ``try`` block has been entered.

    Returns:
        int: Process exit status: 0 when the crawl call returned normally,
        1 when the database check failed.

    Raises:
        SystemExit: Raised by argparse for invalid arguments, including a
        ``--batch`` value below 1.
    """
    parser = argparse.ArgumentParser(
        description='华为应用市场爬虫 - 一键爬取所有应用到数据库',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python3 app/crawler/crawl.py                    # 爬取所有应用（默认50并发）
  python3 app/crawler/crawl.py --limit 10         # 只爬取前10个应用
  python3 app/crawler/crawl.py --batch 100        # 使用100并发
  python3 app/crawler/crawl.py --limit 100 --batch 20  # 爬取100个，20并发
        """
    )
    parser.add_argument('--limit', type=int, help='限制爬取数量（默认爬取所有）')
    parser.add_argument('--batch', type=int, default=50, help='并发数量（默认50）')
    parser.add_argument('--skip-init', action='store_true', help='跳过数据库初始化检查')

    args = parser.parse_args()

    # A non-positive concurrency is meaningless; reject it before any
    # database or network resource is touched.
    if args.batch < 1:
        parser.error('--batch 必须为正整数')

    try:
        # Check the schema (creating tables if needed) unless told to skip it.
        if not args.skip_init and not await init_database():
            print("\n数据库检查失败，请检查配置后重试")
            # Non-zero status so shells and schedulers can detect the failure.
            return 1

        # Run the crawl itself.
        async with HuaweiCrawler() as crawler:
            await crawler.crawl_by_ids(limit=args.limit, batch_size=args.batch)
        return 0

    finally:
        # Dispose the engine before the event loop closes to avoid warnings.
        await engine.dispose()


if __name__ == "__main__":
    # Propagate main()'s status code as the process exit code.
    sys.exit(asyncio.run(main()))