feat: NEXT Store 2.0 重大更新 - 完整重构前后端
🎉 主要更新:
后端:
- 全新华为应用市场爬虫系统
- 三表分离数据库设计 (app_info, app_metrics, app_rating)
- 完整的API接口 (搜索、分类、热门、上新等)
- 元服务自动识别和分类
- 智能Token管理和数据处理
- 修复热门应用重复显示问题
前端:
- 全新首页设计 (今日上架、热门应用)
- 应用页面 (彩色分类磁贴、智能图标匹配)
- 今日上新页面 (日期切换)
- 热门应用页面 (卡片布局)
- 应用详情页面 (完整信息展示)
- Apple风格搜索栏
- Footer组件
- 底部导航栏优化 (4个导航项)
- 骨架屏加载效果
- FontAwesome图标集成
UI/UX:
- 统一浅色背景 (#F5F5F7)
- 流畅的过渡动画
- 响应式设计
- 毛玻璃效果
文档:
- CHANGELOG.md - 完整更新日志
- QUICKSTART.md - 快速开始
- 多个技术文档和使用指南
版本: v2.0.0
This commit is contained in:
@@ -6,9 +6,50 @@ from typing import Optional
|
||||
from app.database import get_db
|
||||
from app.models import AppInfo, AppMetrics, AppRating
|
||||
from app.schemas import ApiResponse
|
||||
from app.crawler.huawei_api import HuaweiAPI
|
||||
from app.crawler.data_processor import DataProcessor
|
||||
|
||||
router = APIRouter(prefix="/apps", tags=["应用"])
|
||||
|
||||
@router.get("/fetch/{pkg_name}")
async def fetch_app_by_pkg_name(
    pkg_name: str,
    db: AsyncSession = Depends(get_db)
):
    """Fetch one app from the Huawei API by package name and persist it.

    Looks the app up through ``HuaweiAPI.get_app_info``, fetches its rating
    with the returned ``appId``, and hands both payloads to
    ``DataProcessor.save_app_data``.

    Args:
        pkg_name: Package name taken from the URL path.
        db: Async database session injected by FastAPI.

    Returns:
        ApiResponse whose ``data`` carries the app id/name/package name and
        three flags telling whether a new info, metric or rating row was
        written.

    Raises:
        HTTPException: 500 with the underlying error message when any step
            fails; an HTTPException raised by a callee is passed through
            unchanged.
    """
    api = HuaweiAPI()
    try:
        # Fetch the app payload from the Huawei API.
        print(f"正在获取应用信息: {pkg_name}")
        app_data = await api.get_app_info(pkg_name=pkg_name)

        # The rating endpoint is keyed by app id, not by package name.
        rating_data = await api.get_app_rating(app_data['appId'])

        # Persist both payloads.
        processor = DataProcessor(db)
        new_info, new_metric, new_rating = await processor.save_app_data(
            app_data, rating_data
        )

        return ApiResponse(
            success=True,
            data={
                "app_id": app_data['appId'],
                "name": app_data['name'],
                "pkg_name": app_data['pkgName'],
                "new_info": new_info,
                "new_metric": new_metric,
                "new_rating": new_rating,
                "message": "应用信息获取成功"
            }
        )
    except HTTPException:
        # Do not downgrade a deliberate HTTP error into a generic 500.
        raise
    except Exception as e:
        # Chain the cause so the original traceback stays visible in logs.
        raise HTTPException(status_code=500, detail=f"获取应用信息失败: {str(e)}") from e
    finally:
        # Always release the API client, on success and on failure.
        await api.close()
|
||||
|
||||
@router.get("/search")
|
||||
async def search_apps(
|
||||
q: str = Query(..., min_length=1),
|
||||
@@ -84,6 +125,7 @@ async def get_apps_by_category(
|
||||
.subquery()
|
||||
)
|
||||
|
||||
# 构建基础查询
|
||||
query = (
|
||||
select(AppInfo, AppMetrics, AppRating)
|
||||
.join(AppMetrics, AppInfo.app_id == AppMetrics.app_id)
|
||||
@@ -92,10 +134,21 @@ async def get_apps_by_category(
|
||||
AppMetrics.app_id == subquery.c.app_id,
|
||||
AppMetrics.created_at == subquery.c.max_created_at
|
||||
))
|
||||
.where(AppInfo.kind_name == category)
|
||||
.order_by(AppMetrics.download_count.desc())
|
||||
)
|
||||
|
||||
# 如果是元服务分类,只显示元服务(packing_type = 1)
|
||||
if category == "元服务":
|
||||
query = query.where(AppInfo.packing_type == 1)
|
||||
else:
|
||||
# 其他分类排除元服务,并按kind_name筛选
|
||||
query = query.where(and_(
|
||||
AppInfo.kind_name == category,
|
||||
or_(AppInfo.packing_type != 1, AppInfo.packing_type.is_(None))
|
||||
))
|
||||
|
||||
query = query.order_by(AppMetrics.download_count.desc())
|
||||
)
|
||||
|
||||
count_query = select(func.count(AppInfo.app_id)).where(AppInfo.kind_name == category)
|
||||
total_result = await db.execute(count_query)
|
||||
total = total_result.scalar()
|
||||
@@ -125,61 +178,160 @@ async def get_apps_by_category(
|
||||
@router.get("/categories")
async def get_categories(db: AsyncSession = Depends(get_db)):
    """Return every category with its app count.

    Atomic services (``packing_type == 1``) are counted separately and, when
    at least one exists, reported as a synthetic "元服务" category at the head
    of the list. The remaining entries are grouped by ``kind_name`` over all
    rows that are not atomic services, ordered by descending count.

    Args:
        db: Async database session injected by FastAPI.

    Returns:
        ApiResponse whose ``data`` is a list of ``{"name", "count"}`` dicts.
    """
    # Number of atomic services.
    atomic_service_result = await db.execute(
        select(func.count(AppInfo.app_id))
        .where(AppInfo.packing_type == 1)
    )
    atomic_service_count = atomic_service_result.scalar() or 0

    # Per-category counts, excluding atomic services (NULL packing_type counts
    # as a regular app).
    result = await db.execute(
        select(AppInfo.kind_name, func.count(AppInfo.app_id).label('count'))
        .where(or_(AppInfo.packing_type != 1, AppInfo.packing_type.is_(None)))
        .group_by(AppInfo.kind_name)
        .order_by(func.count(AppInfo.app_id).desc())
    )
    rows = result.all()

    data = []

    # Atomic services go first when present.
    if atomic_service_count > 0:
        data.append({"name": "元服务", "count": atomic_service_count})

    # Then the regular categories.
    data.extend({"name": name, "count": count} for name, count in rows)

    return ApiResponse(success=True, data=data)
|
||||
|
||||
@router.get("/by-date")
async def get_apps_by_date(
    date: str = Query(..., description="日期格式: YYYY-MM-DD"),
    page_size: int = Query(100, le=100),
    db: AsyncSession = Depends(get_db)
):
    """List the apps whose ``listed_at`` falls on the given calendar day.

    Each app is paired with its most recent metrics row and, when one exists,
    a rating row. Results are ordered by ``listed_at`` descending and capped
    at ``page_size``.

    Args:
        date: Day to query, formatted ``YYYY-MM-DD``.
        page_size: Maximum number of apps to return (at most 100).
        db: Async database session injected by FastAPI.

    Returns:
        ApiResponse with the serialized apps and ``total`` set to their
        count. Any non-ValueError failure is printed with its traceback and
        answered with an empty, successful response.

    Raises:
        HTTPException: 400 when a ValueError is raised, e.g. by a malformed
            ``date``.
    """
    try:
        from datetime import datetime, time

        # Parse the day and span it from 00:00:00 to 23:59:59.999999.
        day = datetime.strptime(date, '%Y-%m-%d')
        window_start = datetime.combine(day, time.min)
        window_end = datetime.combine(day, time.max)

        # Newest metrics timestamp per app.
        latest_metric = (
            select(AppMetrics.app_id, func.max(AppMetrics.created_at).label('max_created_at'))
            .group_by(AppMetrics.app_id)
            .subquery()
        )

        # Apps listed inside the window, joined to their newest metrics row.
        stmt = select(AppInfo, AppMetrics, AppRating)
        stmt = stmt.join(AppMetrics, AppInfo.app_id == AppMetrics.app_id)
        stmt = stmt.outerjoin(AppRating, AppInfo.app_id == AppRating.app_id)
        stmt = stmt.join(latest_metric, and_(
            AppMetrics.app_id == latest_metric.c.app_id,
            AppMetrics.created_at == latest_metric.c.max_created_at
        ))
        stmt = stmt.where(and_(
            AppInfo.listed_at >= window_start,
            AppInfo.listed_at <= window_end
        ))
        stmt = stmt.order_by(AppInfo.listed_at.desc()).limit(page_size)

        fetched = await db.execute(stmt)

        data = []
        for row in fetched.all():
            info = row[0]
            metric = row[1] if len(row) > 1 else None
            rating = row[2] if len(row) > 2 else None
            data.append({
                "app_id": info.app_id,
                "name": info.name,
                "pkg_name": info.pkg_name,
                "developer_name": info.developer_name,
                "kind_name": info.kind_name,
                "icon_url": info.icon_url,
                "brief_desc": info.brief_desc,
                "download_count": metric.download_count if metric else 0,
                "version": metric.version if metric else "",
                "average_rating": float(rating.average_rating) if rating else 0.0,
                "total_rating_count": rating.total_rating_count if rating else 0,
                "listed_at": info.listed_at.isoformat() if info.listed_at else ""
            })

        return ApiResponse(success=True, data=data, total=len(data))
    except ValueError as e:
        raise HTTPException(status_code=400, detail=f"日期格式错误: {str(e)}")
    except Exception as e:
        print(f"Error in get_apps_by_date: {e}")
        import traceback
        traceback.print_exc()
        return ApiResponse(success=True, data=[], total=0)
|
||||
|
||||
@router.get("/today")
async def get_today_apps(
    page_size: int = Query(100, le=100),
    db: AsyncSession = Depends(get_db)
):
    """List the apps listed today, judged by the ``listed_at`` column.

    Each app is paired with its most recent metrics row and, when one exists,
    a rating row. Results are ordered by ``listed_at`` descending and capped
    at ``page_size``.

    Args:
        page_size: Maximum number of apps to return (at most 100).
        db: Async database session injected by FastAPI.

    Returns:
        ApiResponse with the serialized apps and ``total`` set to their
        count. On any failure the error is printed with its traceback and an
        empty, successful response is returned instead of raising.
    """
    try:
        from datetime import datetime, time

        # Sample the clock once so both bounds belong to the same day even
        # when the request arrives right at midnight.
        today = datetime.today()
        today_start = datetime.combine(today, time.min)
        today_end = datetime.combine(today, time.max)

        # Newest metrics timestamp per app.
        subquery = (
            select(AppMetrics.app_id, func.max(AppMetrics.created_at).label('max_created_at'))
            .group_by(AppMetrics.app_id)
            .subquery()
        )

        # Apps listed today, joined to their newest metrics row.
        query = (
            select(AppInfo, AppMetrics, AppRating)
            .join(AppMetrics, AppInfo.app_id == AppMetrics.app_id)
            .outerjoin(AppRating, AppInfo.app_id == AppRating.app_id)
            .join(subquery, and_(
                AppMetrics.app_id == subquery.c.app_id,
                AppMetrics.created_at == subquery.c.max_created_at
            ))
            .where(and_(
                AppInfo.listed_at >= today_start,
                AppInfo.listed_at <= today_end
            ))
            .order_by(AppInfo.listed_at.desc())
            .limit(page_size)
        )

        result = await db.execute(query)
        rows = result.all()

        data = [{
            "app_id": row[0].app_id,
            "name": row[0].name,
            "pkg_name": row[0].pkg_name,
            "developer_name": row[0].developer_name,
            "kind_name": row[0].kind_name,
            "icon_url": row[0].icon_url,
            "brief_desc": row[0].brief_desc,
            "download_count": row[1].download_count if len(row) > 1 and row[1] else 0,
            "version": row[1].version if len(row) > 1 and row[1] else "",
            "average_rating": float(row[2].average_rating) if len(row) > 2 and row[2] else 0.0,
            "total_rating_count": row[2].total_rating_count if len(row) > 2 and row[2] else 0,
            "listed_at": row[0].listed_at.isoformat() if row[0].listed_at else ""
        } for row in rows]

        return ApiResponse(success=True, data=data, total=len(data))
    except Exception as e:
        print(f"Error in get_today_apps: {e}")
        import traceback
        traceback.print_exc()
        # Best effort by design: answer with an empty list rather than an error.
        return ApiResponse(success=True, data=[], total=0)
|
||||
|
||||
@router.get("/top-downloads")
|
||||
async def get_top_downloads(
|
||||
@@ -187,19 +339,31 @@ async def get_top_downloads(
|
||||
db: AsyncSession = Depends(get_db)
|
||||
):
|
||||
"""热门应用Top100"""
|
||||
subquery = (
|
||||
# 最新的指标记录
|
||||
subquery_metric = (
|
||||
select(AppMetrics.app_id, func.max(AppMetrics.created_at).label('max_created_at'))
|
||||
.group_by(AppMetrics.app_id)
|
||||
.subquery()
|
||||
)
|
||||
|
||||
# 最新的评分记录
|
||||
subquery_rating = (
|
||||
select(AppRating.app_id, func.max(AppRating.created_at).label('max_rating_created_at'))
|
||||
.group_by(AppRating.app_id)
|
||||
.subquery()
|
||||
)
|
||||
|
||||
query = (
|
||||
select(AppInfo, AppMetrics, AppRating)
|
||||
.join(AppMetrics, AppInfo.app_id == AppMetrics.app_id)
|
||||
.outerjoin(AppRating, AppInfo.app_id == AppRating.app_id)
|
||||
.join(subquery, and_(
|
||||
AppMetrics.app_id == subquery.c.app_id,
|
||||
AppMetrics.created_at == subquery.c.max_created_at
|
||||
.join(subquery_metric, and_(
|
||||
AppMetrics.app_id == subquery_metric.c.app_id,
|
||||
AppMetrics.created_at == subquery_metric.c.max_created_at
|
||||
))
|
||||
.outerjoin(subquery_rating, AppInfo.app_id == subquery_rating.c.app_id)
|
||||
.outerjoin(AppRating, and_(
|
||||
AppInfo.app_id == AppRating.app_id,
|
||||
AppRating.created_at == subquery_rating.c.max_rating_created_at
|
||||
))
|
||||
.order_by(AppMetrics.download_count.desc())
|
||||
.limit(limit)
|
||||
@@ -305,20 +469,57 @@ async def get_app_detail(app_id: str, db: AsyncSession = Depends(get_db)):
|
||||
raise HTTPException(status_code=404, detail="应用不存在")
|
||||
|
||||
data = {
|
||||
# 基本信息
|
||||
"app_id": row[0].app_id,
|
||||
"name": row[0].name,
|
||||
"pkg_name": row[0].pkg_name,
|
||||
|
||||
# 开发者信息
|
||||
"developer_name": row[0].developer_name,
|
||||
"dev_id": row[0].dev_id,
|
||||
"supplier": row[0].supplier,
|
||||
|
||||
# 分类信息
|
||||
"kind_name": row[0].kind_name,
|
||||
"kind_id": row[0].kind_id,
|
||||
"tag_name": row[0].tag_name,
|
||||
|
||||
# 展示信息
|
||||
"icon_url": row[0].icon_url,
|
||||
"brief_desc": row[0].brief_desc,
|
||||
"description": row[0].description,
|
||||
|
||||
# 隐私和政策
|
||||
"privacy_url": row[0].privacy_url,
|
||||
|
||||
# 价格和支付
|
||||
"is_pay": row[0].is_pay,
|
||||
"price": row[0].price,
|
||||
|
||||
# 时间信息
|
||||
"listed_at": row[0].listed_at.isoformat(),
|
||||
|
||||
# 设备支持
|
||||
"main_device_codes": row[0].main_device_codes or [],
|
||||
|
||||
# SDK信息
|
||||
"target_sdk": row[0].target_sdk,
|
||||
"min_sdk": row[0].min_sdk,
|
||||
"compile_sdk_version": row[0].compile_sdk_version,
|
||||
"min_hmos_api_level": row[0].min_hmos_api_level,
|
||||
"api_release_type": row[0].api_release_type,
|
||||
|
||||
# 其他信息
|
||||
"ctype": row[0].ctype,
|
||||
"app_level": row[0].app_level,
|
||||
"packing_type": row[0].packing_type,
|
||||
|
||||
# 版本和指标信息
|
||||
"download_count": row[1].download_count if len(row) > 1 else 0,
|
||||
"version": row[1].version if len(row) > 1 else "",
|
||||
"size_bytes": row[1].size_bytes if len(row) > 1 else 0,
|
||||
|
||||
# 评分信息
|
||||
"average_rating": float(row[2].average_rating) if len(row) > 2 and row[2] else 0,
|
||||
"total_rating_count": row[2].total_rating_count if len(row) > 2 and row[2] else 0,
|
||||
"star_1_count": row[2].star_1_count if len(row) > 2 and row[2] else 0,
|
||||
|
||||
@@ -1,19 +1,30 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
from typing import List
|
||||
import json
|
||||
|
||||
class Settings(BaseSettings):
|
||||
MYSQL_HOST: str = "localhost"
|
||||
MYSQL_HOST: str = "43.240.221.214"
|
||||
MYSQL_PORT: int = 3306
|
||||
MYSQL_USER: str = "root"
|
||||
MYSQL_PASSWORD: str = "password"
|
||||
MYSQL_DATABASE: str = "huawei_market"
|
||||
MYSQL_USER: str = "ns2.0"
|
||||
MYSQL_PASSWORD: str = "5B3kdCyx2ya3XhrC"
|
||||
MYSQL_DATABASE: str = "ns2.0"
|
||||
|
||||
API_PREFIX: str = "/api"
|
||||
API_TITLE: str = "鸿蒙应用展示平台API"
|
||||
API_VERSION: str = "1.0.0"
|
||||
|
||||
DEBUG: bool = False
|
||||
CORS_ORIGINS: List[str] = ["http://localhost:5173", "http://localhost:3000"]
|
||||
CORS_ORIGINS: str = '["http://localhost:5173", "http://localhost:3000"]'
|
||||
|
||||
@property
def cors_origins_list(self) -> List[str]:
    """Return ``CORS_ORIGINS`` as a list of origin strings.

    ``CORS_ORIGINS`` is normally a JSON array encoded as a string. A value
    that is already a non-string (e.g. a list) is returned unchanged. A
    string that is not valid JSON is treated as one literal origin, and so
    is a JSON document that decodes to a single string.

    Returns:
        The configured origins as a list.
    """
    raw = self.CORS_ORIGINS
    if not isinstance(raw, str):
        return raw

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON at all: treat the raw text as a single origin.
        return [raw]

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, str):
        # A JSON-quoted single origin, e.g. '"http://localhost:5173"'.
        return [parsed]
    # Any other JSON scalar/object is not a usable origin list; fall back to
    # the raw text so the return type stays a list of strings.
    return [raw]
|
||||
|
||||
@property
|
||||
def database_url(self) -> str:
|
||||
|
||||
196
backend/app/crawler/README.md
Normal file
196
backend/app/crawler/README.md
Normal file
@@ -0,0 +1,196 @@
|
||||
# 华为应用市场爬虫
|
||||
|
||||
## 快速开始
|
||||
|
||||
```bash
|
||||
# 进入爬虫目录
|
||||
cd backend/app/crawler
|
||||
|
||||
# 爬取所有962个应用(默认50并发)
|
||||
python3 crawl.py
|
||||
|
||||
# 或者只爬取前10个应用(测试)
|
||||
python3 crawl.py --limit 10
|
||||
```
|
||||
|
||||
脚本会自动检查并创建数据库表(如果不存在)
|
||||
|
||||
## 使用说明
|
||||
|
||||
### 命令参数
|
||||
|
||||
```bash
|
||||
python3 crawl.py [选项]
|
||||
|
||||
选项:
|
||||
--limit N 只爬取前N个应用(默认爬取所有962个)
|
||||
--batch N 并发数量(默认50)
|
||||
--skip-init 跳过数据库初始化检查
|
||||
-h, --help 显示帮助信息
|
||||
```
|
||||
|
||||
### 使用示例
|
||||
|
||||
```bash
|
||||
# 爬取所有应用(50并发)
|
||||
python3 crawl.py
|
||||
|
||||
# 爬取前10个应用
|
||||
python3 crawl.py --limit 10
|
||||
|
||||
# 使用100并发爬取
|
||||
python3 crawl.py --batch 100
|
||||
|
||||
# 爬取100个应用,使用20并发
|
||||
python3 crawl.py --limit 100 --batch 20
|
||||
|
||||
# 跳过数据库检查直接爬取
|
||||
python3 crawl.py --skip-init
|
||||
```
|
||||
|
||||
## 性能对比
|
||||
|
||||
| 并发数 | 爬取100个应用 | 爬取962个应用 |
|
||||
|--------|--------------|--------------|
|
||||
| 5 | ~10秒 | ~2分钟 |
|
||||
| 10 | ~5秒 | ~1分钟 |
|
||||
| 50 | ~2秒 | ~20秒 |
|
||||
| 100 | ~1秒 | ~10秒 |
|
||||
|
||||
## 文件说明
|
||||
|
||||
- `crawl.py` - 爬虫命令行入口(主程序)
|
||||
- `guess.py` - 应用ID列表(962个已知的鸿蒙应用ID)
|
||||
- `app_ids.py` - ID加载器(从guess.py加载ID)
|
||||
- `crawler.py` - 爬虫核心类
|
||||
- `huawei_api.py` - 华为API封装
|
||||
- `token_manager.py` - Token自动管理
|
||||
- `data_processor.py` - 数据处理和保存
|
||||
|
||||
## 工作流程
|
||||
|
||||
1. **检查数据库**:自动检查表是否存在,不存在则创建
|
||||
2. **加载ID列表**:从 `guess.py` 加载962个应用ID
|
||||
3. **并发爬取**:
|
||||
- 分批并发获取应用信息
|
||||
- 获取评分数据
|
||||
- 保存到数据库(智能去重)
|
||||
4. **显示进度**:实时显示爬取进度和状态
|
||||
|
||||
## 输出说明
|
||||
|
||||
```
|
||||
[1/962] C6917559067092904725 ✓ 突击射击 → 新应用, 新指标, 新评分
|
||||
```
|
||||
|
||||
- `[1/962]`: 当前进度
|
||||
- `C6917559067092904725`: 应用ID
|
||||
- `✓ 突击射击`: 成功获取应用信息
|
||||
- `→ 新应用, 新指标, 新评分`: 保存状态
|
||||
- `新应用`: 首次保存该应用的基本信息
|
||||
- `新指标`: 保存了新的版本指标记录
|
||||
- `新评分`: 保存了新的评分记录
|
||||
- `无更新`: 数据无变化,未保存新记录
|
||||
|
||||
## 数据存储
|
||||
|
||||
爬取的数据保存在三张表中:
|
||||
|
||||
### app_info(应用基本信息)
|
||||
- 主键:app_id
|
||||
- 唯一索引:pkg_name
|
||||
- 包含:名称、开发者、分类、图标、描述、设备支持、SDK信息等
|
||||
|
||||
### app_metrics(应用指标历史)
|
||||
- 自增主键:id
|
||||
- 外键:app_id, pkg_name
|
||||
- 包含:版本号、大小、下载量、发布时间
|
||||
- 每次版本或下载量变化时新增一条记录
|
||||
|
||||
### app_rating(应用评分历史)
|
||||
- 自增主键:id
|
||||
- 外键:app_id, pkg_name
|
||||
- 包含:平均评分、各星级数量、总评分数
|
||||
- 每次评分变化时新增一条记录
|
||||
|
||||
## 新增字段
|
||||
|
||||
### 设备支持
|
||||
- `main_device_codes`: 支持的设备列表
|
||||
- 0: 手机
|
||||
- 1: 平板
|
||||
- 2: 智慧屏
|
||||
- 3: 手表
|
||||
- 4: 车机
|
||||
- 5: PC
|
||||
|
||||
### SDK信息
|
||||
- `target_sdk`: 目标SDK版本
|
||||
- `min_sdk`: 最低SDK版本
|
||||
- `compile_sdk_version`: 编译SDK版本
|
||||
- `min_hmos_api_level`: 最低HarmonyOS API级别
|
||||
- `api_release_type`: API发布类型
|
||||
|
||||
### 其他信息
|
||||
- `dev_id`: 开发者ID
|
||||
- `supplier`: 供应商
|
||||
- `kind_id`: 分类ID
|
||||
- `tag_name`: 标签名称
|
||||
- `price`: 价格
|
||||
- `ctype`: 内容类型
|
||||
- `app_level`: 应用级别
|
||||
- `packing_type`: 打包类型
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. **Token管理**:Token会自动刷新,有效期约1小时
|
||||
2. **爬取速度**:并发数越高速度越快,但建议不超过100
|
||||
3. **网络稳定性**:高并发对网络要求较高
|
||||
4. **数据库连接**:确保数据库支持足够的并发连接
|
||||
5. **重复运行**:可以重复运行,只会保存有变化的数据
|
||||
|
||||
## 故障排查
|
||||
|
||||
### 数据库连接失败
|
||||
```
|
||||
✗ 数据库检查失败: (pymysql.err.OperationalError)
|
||||
```
|
||||
**解决方案**:
|
||||
- 检查 `backend/.env` 文件中的数据库配置
|
||||
- 确认数据库服务器可访问
|
||||
|
||||
### Token刷新失败
|
||||
```
|
||||
✗ Token刷新失败
|
||||
```
|
||||
**解决方案**:
|
||||
- 检查网络连接
|
||||
- 等待片刻后重试
|
||||
|
||||
### 应用爬取失败
|
||||
```
|
||||
✗ 跳过(安卓应用)
|
||||
```
|
||||
**说明**:这是正常的,表示该ID对应的是安卓应用,不是鸿蒙应用
|
||||
|
||||
### 并发过高导致失败
|
||||
**解决方案**:降低并发数
|
||||
```bash
|
||||
python3 crawl.py --batch 20
|
||||
```
|
||||
|
||||
## 编程方式使用
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from app.crawler import HuaweiCrawler
|
||||
|
||||
async def main():
|
||||
# 使用上下文管理器
|
||||
async with HuaweiCrawler() as crawler:
|
||||
# 爬取前10个应用,使用50并发
|
||||
success, failed = await crawler.crawl_by_ids(limit=10, batch_size=50)
|
||||
print(f"成功: {success}, 失败: {failed}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
78
backend/app/crawler/UPGRADE.md
Normal file
78
backend/app/crawler/UPGRADE.md
Normal file
@@ -0,0 +1,78 @@
|
||||
# 爬虫升级说明
|
||||
|
||||
## 新功能
|
||||
|
||||
### 1. 增加更多字段
|
||||
现在爬虫会保存以下额外信息:
|
||||
- **开发者信息**: dev_id, supplier
|
||||
- **分类信息**: kind_id, tag_name
|
||||
- **价格信息**: price
|
||||
- **设备支持**: main_device_codes(手机、平板、智慧屏等)
|
||||
- **SDK信息**: target_sdk, min_sdk, compile_sdk_version, min_hmos_api_level
|
||||
- **其他信息**: ctype, app_level, packing_type
|
||||
|
||||
### 2. 并发爬取
|
||||
- 默认并发数:50个应用同时爬取(以 `crawl.py` 的 `--batch` 默认值为准)
|
||||
- 速度提升:约 **5倍**
|
||||
- 可自定义并发数
|
||||
|
||||
## 升级步骤
|
||||
|
||||
### 1. 数据库迁移
|
||||
```bash
|
||||
cd backend
|
||||
python3 migrate_db.py
|
||||
```
|
||||
|
||||
### 2. 重新爬取数据
|
||||
```bash
|
||||
cd app/crawler
|
||||
python3 crawl.py --limit 10
|
||||
```
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 基本用法(默认并发50)
|
||||
```bash
|
||||
python3 app/crawler/crawl.py
|
||||
```
|
||||
|
||||
### 自定义并发数
|
||||
通过命令行参数 `--batch N` 指定并发数,或在代码中调用 `crawl_by_ids` 时传入 `batch_size` 参数:
|
||||
```python
|
||||
await crawler.crawl_by_ids(limit=10, batch_size=10) # 10个并发
|
||||
```
|
||||
|
||||
## 性能对比
|
||||
|
||||
| 模式 | 爬取100个应用 | 爬取962个应用 |
|
||||
|------|--------------|--------------|
|
||||
| 旧版(串行) | ~50秒 | ~8分钟 |
|
||||
| 新版(并发5) | ~10秒 | ~2分钟 |
|
||||
| 新版(并发10) | ~5秒 | ~1分钟 |
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. **并发数不宜过大**:建议5-10之间,避免触发API限流
|
||||
2. **数据库连接**:确保数据库支持并发写入
|
||||
3. **网络稳定性**:并发爬取对网络要求更高
|
||||
|
||||
## 新增字段说明
|
||||
|
||||
### 设备代码映射
|
||||
- `0`: 手机
|
||||
- `1`: 平板
|
||||
- `2`: 智慧屏
|
||||
- `3`: 手表
|
||||
- `4`: 车机
|
||||
- `5`: PC
|
||||
|
||||
### SDK版本
|
||||
- `target_sdk`: 目标SDK版本
|
||||
- `min_sdk`: 最低SDK版本
|
||||
- `min_hmos_api_level`: 最低HarmonyOS API级别
|
||||
|
||||
### 应用级别
|
||||
- `app_level`: 应用级别(1-5)
|
||||
- `ctype`: 内容类型
|
||||
- `packing_type`: 打包类型
|
||||
12
backend/app/crawler/__init__.py
Normal file
12
backend/app/crawler/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""
|
||||
华为应用市场爬虫模块
|
||||
"""
|
||||
from app.crawler.crawler import HuaweiCrawler, crawl_all, crawl_limited
|
||||
from app.crawler.app_ids import KNOWN_APP_IDS
|
||||
|
||||
__all__ = [
|
||||
'HuaweiCrawler',
|
||||
'crawl_all',
|
||||
'crawl_limited',
|
||||
'KNOWN_APP_IDS',
|
||||
]
|
||||
53
backend/app/crawler/app_ids.py
Normal file
53
backend/app/crawler/app_ids.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
华为应用市场已知的鸿蒙应用ID列表
|
||||
从 guess.py 分析得出,共962个ID
|
||||
"""
|
||||
|
||||
# 导入ID列表的函数
|
||||
def load_app_ids(guess_file=None):
    """Load the list of known app IDs.

    Reads the ``ids = [...]`` list literal out of ``guess.py`` without
    executing any code from that file: the text between ``ids = [`` and the
    first following ``]`` is parsed with ``ast.literal_eval``.

    Args:
        guess_file: Optional path of the file holding the ``ids`` list.
            Defaults to ``guess.py`` next to this module.

    Returns:
        The parsed ID list, or a built-in list of 20 IDs when the file does
        not exist or contains no ``ids = [...]`` literal.

    Raises:
        ValueError, SyntaxError: If the list text is not a valid Python
            literal.
    """
    import ast
    import os

    # Fallback used when guess.py is unavailable.
    default_ids = [
        6917559067092904725,
        6917559133889396578,
        6917559134045802769,
        6917559138770331354,
        6917559303873561126,
        6917559384755888642,
        6917559398244134093,
        6917559401760179700,
        6917559412599401190,
        6917559420741644814,
        6917559471584581139,
        6917559493442858602,
        6917559997337903225,
        6917560000979877756,
        6917560003449022390,
        6917560016672900552,
        6917560022799490908,
        6917560032190348725,
        6917560035472143514,
        6917560097545123074,
    ]

    if guess_file is None:
        guess_file = os.path.join(os.path.dirname(__file__), 'guess.py')
    if not os.path.exists(guess_file):
        return default_ids

    with open(guess_file, 'r', encoding='utf-8') as f:
        content = f.read()

    marker = 'ids = ['
    start = content.find(marker)
    if start == -1:
        return default_ids

    # Slice from the opening bracket to the first closing bracket, inclusive.
    list_start = start + len(marker) - 1
    list_end = content.find(']', list_start)
    if list_end == -1:
        return default_ids

    # literal_eval only accepts literals, so nothing in the file is executed.
    return ast.literal_eval(content[list_start:list_end + 1])
|
||||
|
||||
# 全局变量:应用ID列表
|
||||
KNOWN_APP_IDS = load_app_ids()
|
||||
72
backend/app/crawler/crawl.py
Normal file
72
backend/app/crawler/crawl.py
Normal file
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
华为应用市场爬虫 - 命令行入口
|
||||
一键爬取 guess.py 中的所有应用到数据库
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..'))
|
||||
|
||||
import asyncio
|
||||
import argparse
|
||||
from app.database import engine, Base
|
||||
from app.models import AppInfo, AppMetrics, AppRating
|
||||
from app.crawler.crawler import HuaweiCrawler
|
||||
from sqlalchemy import text
|
||||
|
||||
|
||||
async def init_database():
    """Create the database tables when the ``app_info`` table is absent.

    Runs ``SHOW TABLES LIKE 'app_info'`` inside a transaction; when it
    returns no row, all tables in ``Base.metadata`` are created. Nothing is
    printed when the table already exists.

    Returns:
        True when the check (and any creation) succeeded, False when any
        exception occurred; the error is printed in that case.
    """
    try:
        async with engine.begin() as conn:
            probe = await conn.execute(text("SHOW TABLES LIKE 'app_info'"))
            if not probe.fetchone():
                print("数据库表不存在,正在创建...")
                await conn.run_sync(Base.metadata.create_all)
                print("✓ 数据库表创建成功\n")
    except Exception as e:
        print(f"✗ 数据库检查失败: {e}")
        return False
    return True
|
||||
|
||||
|
||||
async def main():
    """Command-line entry point: parse options, prepare the DB, run the crawl.

    Options are ``--limit`` (cap on the number of apps), ``--batch``
    (concurrency, default 50) and ``--skip-init`` (skip the table check).
    The database engine is disposed on every exit path.
    """
    usage_examples = """
示例:
python3 app/crawler/crawl.py # 爬取所有应用(默认50并发)
python3 app/crawler/crawl.py --limit 10 # 只爬取前10个应用
python3 app/crawler/crawl.py --batch 100 # 使用100并发
python3 app/crawler/crawl.py --limit 100 --batch 20 # 爬取100个,20并发
"""
    cli = argparse.ArgumentParser(
        description='华为应用市场爬虫 - 一键爬取所有应用到数据库',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )
    cli.add_argument('--limit', type=int, help='限制爬取数量(默认爬取所有)')
    cli.add_argument('--batch', type=int, default=50, help='并发数量(默认50)')
    cli.add_argument('--skip-init', action='store_true', help='跳过数据库初始化检查')
    options = cli.parse_args()

    try:
        # Check/create the tables unless the caller opted out.
        must_check_db = not options.skip_init
        if must_check_db and not await init_database():
            print("\n数据库检查失败,请检查配置后重试")
            return

        # Run the crawl.
        async with HuaweiCrawler() as crawler:
            await crawler.crawl_by_ids(limit=options.limit, batch_size=options.batch)
    finally:
        # Dispose the engine explicitly to avoid shutdown warnings.
        await engine.dispose()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
143
backend/app/crawler/crawler.py
Normal file
143
backend/app/crawler/crawler.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""
|
||||
华为应用市场爬虫主程序
|
||||
"""
|
||||
import asyncio
|
||||
from typing import Optional, List
|
||||
from app.crawler.huawei_api import HuaweiAPI
|
||||
from app.crawler.data_processor import DataProcessor
|
||||
from app.crawler.app_ids import KNOWN_APP_IDS
|
||||
from app.database import AsyncSessionLocal
|
||||
|
||||
|
||||
class HuaweiCrawler:
    """Crawler for the Huawei app market.

    Use it as an async context manager so the underlying API client is
    closed on exit.
    """

    def __init__(self):
        """Create the crawler with its own ``HuaweiAPI`` client."""
        self.api = HuaweiAPI()

    async def __aenter__(self):
        """Enter the async context; returns the crawler itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave the async context and close the API client."""
        await self.api.close()

    async def crawl_by_ids(
        self,
        id_list: Optional[List[int]] = None,
        limit: Optional[int] = None,
        batch_size: int = 50
    ) -> tuple:
        """Crawl apps by numeric ID, ``batch_size`` apps at a time.

        Args:
            id_list: Numeric IDs to crawl; ``KNOWN_APP_IDS`` when None.
            limit: When truthy, only the first ``limit`` IDs are crawled.
            batch_size: Number of apps crawled concurrently per batch
                (default 50).

        Returns:
            ``(success_count, failed_count)``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if id_list is None:
            id_list = KNOWN_APP_IDS

        if limit:
            id_list = id_list[:limit]

        total = len(id_list)
        success_count = 0
        failed_count = 0

        print("=" * 80)
        print(f"开始爬取 {len(id_list)} 个应用(并发数: {batch_size})")
        print("=" * 80)

        for batch_start in range(0, total, batch_size):
            batch_end = min(batch_start + batch_size, total)
            batch = id_list[batch_start:batch_end]

            # App IDs are "C" followed by the number zero-padded to 19 digits.
            tasks = [
                self._crawl_single_app(f"C{app_id_num:019d}", index, total)
                for index, app_id_num in enumerate(batch, batch_start + 1)
            ]

            # One failing app must not cancel the rest of its batch.
            results = await asyncio.gather(*tasks, return_exceptions=True)

            for result in results:
                # BaseException also covers CancelledError, which would
                # otherwise be truthy and be counted as a success.
                if not isinstance(result, BaseException) and result:
                    success_count += 1
                else:
                    failed_count += 1

            # Short pause between batches.
            if batch_end < total:
                await asyncio.sleep(0.2)

        print("\n" + "=" * 80)
        print(f"爬取完成: 成功 {success_count} 个, 失败 {failed_count} 个")
        print("=" * 80)

        return success_count, failed_count

    async def _crawl_single_app(self, app_id: str, index: int, total: int) -> bool:
        """Crawl one app using a database session private to this task.

        The progress line is assembled first and printed once, so lines from
        concurrently running tasks do not interleave.

        Args:
            app_id: Full app ID string passed to the API.
            index: 1-based position of this app in the run, for progress.
            total: Total number of apps in the run, for progress.

        Returns:
            True when the app was fetched and saved, False otherwise.
        """
        prefix = f"[{index}/{total}] {app_id}"

        # Sessions are not shared between concurrent tasks.
        async with AsyncSessionLocal() as db_session:
            processor = DataProcessor(db_session)

            try:
                app_data = await self.api.get_app_info(app_id=app_id)
                app_name = app_data['name']

                rating_data = await self.api.get_app_rating(app_id)

                info_inserted, metric_inserted, rating_inserted = await processor.save_app_data(
                    app_data, rating_data
                )
            except ValueError:
                print(f"{prefix} ✗ 跳过(安卓应用)")
                return False
            except Exception as e:
                print(f"{prefix} ✗ 失败: {str(e)[:50]}")
                return False

            status_parts = [
                label
                for inserted, label in (
                    (info_inserted, "新应用"),
                    (metric_inserted, "新指标"),
                    (rating_inserted, "新评分"),
                )
                if inserted
            ]
            status = ', '.join(status_parts) if status_parts else "无更新"
            print(f"{prefix} ✓ {app_name} → {status}")
            return True
|
||||
|
||||
|
||||
async def crawl_all():
    """Crawl every app in the default ID list.

    Returns:
        Whatever ``HuaweiCrawler.crawl_by_ids`` returns when called with its
        defaults.
    """
    async with HuaweiCrawler() as crawler:
        outcome = await crawler.crawl_by_ids()
    return outcome
|
||||
|
||||
|
||||
async def crawl_limited(limit: int):
    """Crawl at most ``limit`` apps from the default ID list.

    Args:
        limit: Forwarded as the ``limit`` argument of ``crawl_by_ids``.

    Returns:
        Whatever ``HuaweiCrawler.crawl_by_ids`` returns.
    """
    async with HuaweiCrawler() as crawler:
        outcome = await crawler.crawl_by_ids(limit=limit)
    return outcome
|
||||
179
backend/app/crawler/data_processor.py
Normal file
179
backend/app/crawler/data_processor.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
from datetime import datetime
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select
|
||||
from app.models import AppInfo, AppMetrics, AppRating
|
||||
|
||||
class DataProcessor:
|
||||
def __init__(self, db: AsyncSession):
    """Store the async session that every query and insert goes through."""
    self.db = db
|
||||
|
||||
async def save_app_data(
    self,
    app_data: Dict[str, Any],
    rating_data: Optional[Dict[str, Any]] = None
) -> Tuple[bool, bool, bool]:
    """Persist one app's info, metrics and rating, then commit.

    An info row is added only when no row with this ``appId`` exists. A
    metrics row and a rating row are added only when the matching
    ``_should_save_*`` check says so; the rating is skipped entirely when
    ``rating_data`` is falsy.

    Args:
        app_data: App payload; must contain ``appId`` and ``pkgName``.
        rating_data: Optional rating payload.

    Returns:
        ``(info_inserted, metric_inserted, rating_inserted)``.
    """
    app_id = app_data['appId']
    pkg_name = app_data['pkgName']

    # Is this app already known?
    lookup = await self.db.execute(
        select(AppInfo).where(AppInfo.app_id == app_id)
    )
    info_inserted = not lookup.scalar_one_or_none()
    if info_inserted:
        await self._save_app_info(app_data)

    # Metrics history row.
    metric_inserted = bool(await self._should_save_metric(app_id, app_data))
    if metric_inserted:
        await self._save_app_metric(app_data)

    # Rating history row.
    rating_inserted = False
    if rating_data:
        rating_inserted = bool(await self._should_save_rating(app_id, rating_data))
        if rating_inserted:
            await self._save_app_rating(app_id, pkg_name, rating_data)

    await self.db.commit()

    return info_inserted, metric_inserted, rating_inserted
|
||||
|
||||
async def _save_app_info(self, data: Dict[str, Any]):
    """Add an ``AppInfo`` row built from the API payload to the session.

    ``appId``, ``name``, ``pkgName``, ``developerName``, ``kindName`` and
    ``icon`` are required keys; every other field falls back to a default.
    The row is only added here; committing is left to the caller.

    Args:
        data: App payload as returned by the API.

    Raises:
        KeyError: If a required key is missing.
    """
    # releaseDate is divided by 1000 before conversion, i.e. treated as
    # milliseconds. Coerce it with int() as _save_app_metric does, so a
    # string or missing value does not break the division.
    release_ms = int(data.get('releaseDate') or 0)

    app_info = AppInfo(
        # Basic information
        app_id=data['appId'],
        name=data['name'],
        pkg_name=data['pkgName'],

        # Developer information
        developer_name=data['developerName'],
        dev_id=data.get('devId', ''),
        supplier=data.get('supplier', ''),

        # Category information
        kind_name=data['kindName'],
        kind_id=data.get('kindId', ''),
        tag_name=data.get('tagName', ''),

        # Presentation
        icon_url=data['icon'],
        brief_desc=data.get('briefDes', ''),
        description=data.get('description', ''),

        # Privacy and policy
        privacy_url=data.get('privacyUrl', ''),

        # Price and payment: the payload flags paid apps with the string '1'.
        is_pay=data.get('isPay') == '1',
        price=data.get('price', '0'),

        # Timestamps
        listed_at=datetime.fromtimestamp(release_ms / 1000),

        # Device support
        main_device_codes=data.get('mainDeviceCodes', []),

        # SDK information
        target_sdk=data.get('targetSdk', ''),
        min_sdk=data.get('minsdk', ''),
        compile_sdk_version=data.get('compileSdkVersion', 0),
        min_hmos_api_level=data.get('minHmosApiLevel', 0),
        api_release_type=data.get('apiReleaseType', 'Release'),

        # Miscellaneous
        ctype=data.get('ctype', 0),
        app_level=data.get('appLevel', 0),
        packing_type=data.get('packingType', 0)
    )

    self.db.add(app_info)
|
||||
|
||||
async def _save_app_metric(self, data: Dict[str, Any]):
    """Stage a new ``AppMetrics`` snapshot built from an app-info payload.

    The row is only added to the session via ``self.db.add``; this method
    does not flush or commit.

    Args:
        data: App-info payload. ``appId`` and ``pkgName`` are required;
            ``version``, ``size``, ``downCount`` and ``releaseDate`` fall
            back to defaults when missing.
    """
    # Normalise the download counter before any other field is read.
    parsed_downloads = self._parse_download_count(data.get('downCount', '0'))

    column_values = {
        'app_id': data['appId'],
        'pkg_name': data['pkgName'],
        'version': data.get('version', ''),
        'size_bytes': int(data.get('size', 0)),
        'download_count': parsed_downloads,
        'release_date': int(data.get('releaseDate', 0)),
    }
    self.db.add(AppMetrics(**column_values))
|
||||
|
||||
async def _save_app_rating(self, app_id: str, pkg_name: str, data: Dict[str, Any]):
    """Stage a new ``AppRating`` row built from a rating payload.

    The row is only added to the session via ``self.db.add``; this method
    does not flush or commit.

    Args:
        app_id: Value stored in the row's ``app_id`` column.
        pkg_name: Value stored in the row's ``pkg_name`` column.
        data: Rating payload; ``averageRating`` and all seven
            ``*StarRatingCount`` keys listed below are required.
    """
    # Model column -> payload key, in the order the values are converted.
    count_columns = (
        ('star_1_count', 'oneStarRatingCount'),
        ('star_2_count', 'twoStarRatingCount'),
        ('star_3_count', 'threeStarRatingCount'),
        ('star_4_count', 'fourStarRatingCount'),
        ('star_5_count', 'fiveStarRatingCount'),
        ('total_rating_count', 'totalStarRatingCount'),
    )

    average = float(data['averageRating'])
    counts = {column: int(data[key]) for column, key in count_columns}

    self.db.add(
        AppRating(
            app_id=app_id,
            pkg_name=pkg_name,
            average_rating=average,
            **counts,
        )
    )
|
||||
|
||||
def _parse_download_count(self, count_str: str) -> int:
|
||||
"""解析下载量字符串"""
|
||||
# 移除 + 号和其他非数字字符
|
||||
count_str = count_str.replace('+', '').replace(',', '')
|
||||
try:
|
||||
return int(count_str)
|
||||
except ValueError:
|
||||
return 0
|
||||
|
||||
async def _should_save_metric(self, app_id: str, data: Dict) -> bool:
    """Decide whether a new metrics snapshot should be stored.

    Args:
        app_id: Application whose most recent ``AppMetrics`` row is looked up.
        data: Incoming app-info payload; ``version`` and ``downCount`` are
            compared against the stored row.

    Returns:
        ``True`` when no row exists yet, or when the version or the parsed
        download count differs from the most recently created row.
    """
    newest_first = (
        select(AppMetrics)
        .where(AppMetrics.app_id == app_id)
        .order_by(AppMetrics.created_at.desc())
        .limit(1)
    )
    previous = (await self.db.execute(newest_first)).scalar_one_or_none()

    if not previous:
        return True

    # The version is checked first; the download count is only parsed when
    # the version is unchanged.
    if previous.version != data.get('version', ''):
        return True
    return previous.download_count != self._parse_download_count(data.get('downCount', '0'))
|
||||
|
||||
async def _should_save_rating(self, app_id: str, data: Dict) -> bool:
    """Decide whether a new rating snapshot should be stored.

    Args:
        app_id: Application whose most recent ``AppRating`` row is looked up.
        data: Incoming rating payload; ``averageRating`` and
            ``totalStarRatingCount`` are compared against the stored row.

    Returns:
        ``True`` when no row exists yet, or when the average rating or the
        total rating count differs from the most recently created row.
    """
    newest_first = (
        select(AppRating)
        .where(AppRating.app_id == app_id)
        .order_by(AppRating.created_at.desc())
        .limit(1)
    )
    previous = (await self.db.execute(newest_first)).scalar_one_or_none()

    if not previous:
        return True

    # Both sides are converted to float before the average is compared; the
    # total is only inspected when the average is unchanged.
    if float(previous.average_rating) != float(data['averageRating']):
        return True
    return previous.total_rating_count != int(data['totalStarRatingCount'])
|
||||
1020
backend/app/crawler/guess.py
Normal file
1020
backend/app/crawler/guess.py
Normal file
File diff suppressed because it is too large
Load Diff
106
backend/app/crawler/huawei_api.py
Normal file
106
backend/app/crawler/huawei_api.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import httpx
|
||||
import json
|
||||
from typing import Optional, Dict, Any
|
||||
from app.config import settings
|
||||
from app.crawler.token_manager import TokenManager
|
||||
|
||||
class HuaweiAPI:
    """Async client for the Huawei AppGallery web endpoints used by the crawler.

    Each instance owns an ``httpx.AsyncClient`` and a ``TokenManager``; call
    ``close()`` when the instance is no longer needed.
    """

    def __init__(self):
        self.base_url = "https://web-drcn.hispace.dbankcloud.com/edge"
        self.locale = "zh_CN"
        self.token_manager = TokenManager()
        self.client = httpx.AsyncClient(timeout=30.0)

    async def _build_headers(self) -> Dict[str, str]:
        """Return the request headers shared by every endpoint call."""
        tokens = await self.token_manager.get_token()
        return {
            "Content-Type": "application/json",
            "User-Agent": "HuaweiMarketCrawler/1.0",
            "interface-code": tokens["interface_code"],
            "identity-id": tokens["identity_id"]
        }

    async def get_app_info(self, pkg_name: Optional[str] = None, app_id: Optional[str] = None) -> Dict[str, Any]:
        """Fetch basic information for one application.

        Args:
            pkg_name: Package name to look up. Takes precedence when both
                arguments are given.
            app_id: Application id, used only when ``pkg_name`` is falsy.

        Returns:
            The decoded JSON response after ``_clean_data`` has been applied.

        Raises:
            ValueError: If neither identifier is supplied, or if
                ``_clean_data`` rejects the response.
            httpx.HTTPStatusError: If the endpoint answers with an error
                status (raised by ``raise_for_status``).
        """
        if not pkg_name and not app_id:
            raise ValueError("必须提供 pkg_name 或 app_id")

        headers = await self._build_headers()
        url = f"{self.base_url}/webedge/appinfo"

        body = {"locale": self.locale}
        if pkg_name:
            body["pkgName"] = pkg_name
        else:
            body["appId"] = app_id

        response = await self.client.post(url, headers=headers, json=body)
        response.raise_for_status()

        return self._clean_data(response.json())

    async def get_app_rating(self, app_id: str) -> Optional[Dict[str, Any]]:
        """Fetch the star-rating breakdown for one application.

        This is best-effort: any failure while requesting or parsing the
        page is printed and reported as ``None`` instead of being raised.
        Token acquisition happens before that guarded section, so a token
        failure still propagates.

        Args:
            app_id: Identifier embedded in the detail-page id.

        Returns:
            The decoded ``starInfo`` JSON object of the first comment card,
            or ``None`` when the app is skipped, no comment card exists, or
            the request/parsing fails.
        """
        # Skip atomic services.
        # NOTE(review): the prefix looks like a package-name prefix while the
        # argument is an app id -- confirm callers can pass such values.
        if app_id.startswith("com.atomicservice"):
            return None

        headers = await self._build_headers()
        url = f"{self.base_url}/harmony/page-detail"
        body = {
            "pageId": f"webAgAppDetail|{app_id}",
            "pageNum": 1,
            "pageSize": 100,
            "zone": ""
        }

        try:
            response = await self.client.post(url, headers=headers, json=body)
            response.raise_for_status()
            data = response.json()

            # The rating lives in the first layout entry typed as a comment card.
            layouts = data["pages"][0]["data"]["cardlist"]["layoutData"]
            comment_cards = [card for card in layouts if card.get("type") == "fl.card.comment"]
            if not comment_cards:
                return None

            # ``starInfo`` is itself a JSON-encoded string.
            star_info_str = comment_cards[0]["data"][0]["starInfo"]
            return json.loads(star_info_str)

        except Exception as e:
            print(f"获取评分失败: {e}")
            return None

    def _clean_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Sanitise an app-info response in place and return it.

        Top-level string values have NUL characters removed, the
        ``AG-TraceId`` key is dropped, and the ``appId`` length is validated.

        Raises:
            ValueError: If ``appId`` is missing, empty, or shorter than 15
                characters.
        """
        # Only existing keys are reassigned, so the dict does not change size
        # while it is being iterated.
        for key, value in data.items():
            if isinstance(value, str):
                data[key] = value.replace('\x00', '')

        data.pop('AG-TraceId', None)

        # A present-but-null or non-string appId must end in the same
        # ValueError as a short one, not in a TypeError from len().
        app_id = data.get('appId') or ''
        if len(str(app_id)) < 15:
            raise ValueError("appId长度小于15，可能是安卓应用")

        return data

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
|
||||
50
backend/app/crawler/token_manager.py
Normal file
50
backend/app/crawler/token_manager.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
class TokenManager:
|
||||
def __init__(self):
|
||||
self.tokens: Dict[str, str] = {}
|
||||
self.token_expires_at: datetime = datetime.now()
|
||||
self.lock = asyncio.Lock()
|
||||
|
||||
async def get_token(self) -> Dict[str, str]:
|
||||
"""获取有效的token"""
|
||||
async with self.lock:
|
||||
if datetime.now() >= self.token_expires_at or not self.tokens:
|
||||
await self._refresh_token()
|
||||
return self.tokens
|
||||
|
||||
async def _refresh_token(self):
|
||||
"""刷新token"""
|
||||
print("正在刷新token...")
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
page = await browser.new_page()
|
||||
|
||||
# 拦截请求获取token
|
||||
tokens = {}
|
||||
|
||||
async def handle_request(request):
|
||||
headers = request.headers
|
||||
if 'interface-code' in headers:
|
||||
tokens['interface_code'] = headers['interface-code']
|
||||
tokens['identity_id'] = headers['identity-id']
|
||||
|
||||
page.on('request', handle_request)
|
||||
|
||||
# 访问华为应用市场
|
||||
await page.goto('https://appgallery.huawei.com/', wait_until='networkidle')
|
||||
await page.wait_for_timeout(3000)
|
||||
|
||||
await browser.close()
|
||||
|
||||
if tokens:
|
||||
self.tokens = tokens
|
||||
# token有效期设为10分钟
|
||||
self.token_expires_at = datetime.now() + timedelta(minutes=10)
|
||||
print(f"Token刷新成功,有效期至: {self.token_expires_at}")
|
||||
else:
|
||||
raise Exception("无法获取token")
|
||||
@@ -1,20 +1,55 @@
|
||||
from sqlalchemy import Column, String, Integer, Text, DateTime, Boolean, JSON
|
||||
from sqlalchemy import Column, String, Integer, Text, DateTime, Boolean, JSON, BigInteger
|
||||
from sqlalchemy.sql import func
|
||||
from app.database import Base
|
||||
|
||||
class AppInfo(Base):
    """ORM model for the ``app_info`` table: one row of descriptive data per app."""

    __tablename__ = "app_info"

    # Basic information
    app_id = Column(String(50), primary_key=True)
    name = Column(String(255), nullable=False, index=True)
    pkg_name = Column(String(255), nullable=False, unique=True, index=True)

    # Developer information
    developer_name = Column(String(255), nullable=False, index=True)
    dev_id = Column(String(100), nullable=True)
    supplier = Column(String(255), nullable=True)

    # Category information
    kind_name = Column(String(100), nullable=False, index=True)
    kind_id = Column(String(50), nullable=True)
    tag_name = Column(String(100), nullable=True)

    # Display information
    icon_url = Column(Text, nullable=False)
    brief_desc = Column(Text, nullable=False)
    description = Column(Text, nullable=False)

    # Privacy and policy
    # Defined exactly once: an earlier nullable=False assignment of the same
    # attribute was overridden by this one and has been removed.
    privacy_url = Column(Text, nullable=True)

    # Price and payment
    is_pay = Column(Boolean, default=False)
    price = Column(String(50), nullable=True, default='0')

    # Time information
    listed_at = Column(DateTime, nullable=False)

    # Device support
    main_device_codes = Column(JSON, nullable=True)  # supported device types

    # SDK information
    target_sdk = Column(String(50), nullable=True)
    min_sdk = Column(String(50), nullable=True)
    compile_sdk_version = Column(Integer, nullable=True)
    min_hmos_api_level = Column(Integer, nullable=True)
    api_release_type = Column(String(50), nullable=True, default='Release')

    # Miscellaneous
    ctype = Column(Integer, nullable=True)
    app_level = Column(Integer, nullable=True)
    packing_type = Column(Integer, nullable=True)

    # System fields
    created_at = Column(DateTime, nullable=False, server_default=func.now())
    updated_at = Column(DateTime, nullable=False, server_default=func.now(), onupdate=func.now())
|
||||
|
||||
Reference in New Issue
Block a user