Support keyword

2025-09-24 03:38:32 +08:00
parent a4891b1c30
commit 8592833d74
16 changed files with 2888 additions and 2 deletions


@@ -0,0 +1,257 @@
from django.core.management.base import BaseCommand
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
from core.models import Website
import json
class Command(BaseCommand):
    help = "Crawl all supported websites in one go"

    def add_arguments(self, parser):
        parser.add_argument(
            '--mode', '-m',
            type=str,
            choices=['full', 'keyword', 'both'],
            default='full',
            help='Crawl mode: full (full-site crawl), keyword (keyword crawl), both (run both modes)'
        )
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='Search keyword for keyword mode'
        )
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='Names of the websites to crawl; if omitted, all supported websites are crawled'
        )
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=500,
            help='Maximum number of pages for the full-site crawl (default: 500)'
        )
        parser.add_argument(
            '--max-search-pages', '-sp',
            type=int,
            default=10,
            help='Maximum number of search pages for the keyword crawl (default: 10)'
        )
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='Maximum number of articles for the keyword crawl (default: 100)'
        )
        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='Start date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='End date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='List all supported websites'
        )
        parser.add_argument(
            '--output', '-o',
            type=str,
            help='Save the results to a JSON file'
        )
        parser.add_argument(
            '--skip-existing',
            action='store_true',
            help='Skip websites that already exist in the database'
        )
    def handle(self, *args, **options):
        # List the supported websites
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("Supported websites:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        mode = options['mode']
        keyword = options['keyword']
        websites = options['websites']
        max_pages = options['max_pages']
        max_search_pages = options['max_search_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        output_file = options['output']
        skip_existing = options['skip_existing']

        # Validate website names
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"Unsupported websites: {', '.join(invalid_websites)}")
                )
                self.stdout.write("Use --list-websites to see the supported websites")
                return

        # Determine the list of websites to crawl
        target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())

        # Keyword mode requires a keyword
        if mode in ['keyword', 'both'] and not keyword:
            self.stdout.write(
                self.style.ERROR("Keyword mode requires the --keyword argument")
            )
            return

        self.stdout.write("Starting one-click crawl task...")
        self.stdout.write(f"Crawl mode: {mode}")
        self.stdout.write(f"Target websites: {', '.join(target_websites)}")
        if keyword:
            self.stdout.write(f"Keyword: {keyword}")
        if start_date:
            self.stdout.write(f"Start date: {start_date}")
        if end_date:
            self.stdout.write(f"End date: {end_date}")

        all_results = {
            "mode": mode,
            "websites": target_websites,
            "keyword": keyword,
            "start_date": start_date,
            "end_date": end_date,
            "full_crawl_results": {},
            "keyword_crawl_results": {},
            "summary": {
                "total_websites": len(target_websites),
                "full_crawl_success": 0,
                "full_crawl_failed": 0,
                "keyword_crawl_success": 0,
                "keyword_crawl_failed": 0
            }
        }
        try:
            for website_name in target_websites:
                self.stdout.write(f"\n{'='*50}")
                self.stdout.write(f"Processing website: {website_name}")
                self.stdout.write(f"{'='*50}")

                # Get or create the Website object
                website, created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
                        'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                        'enabled': True
                    }
                )
                if not created and skip_existing:
                    self.stdout.write(f"Skipping existing website: {website_name}")
                    continue

                website_results = {
                    "full_crawl": None,
                    "keyword_crawl": None
                }

                # Full-site crawl
                if mode in ['full', 'both']:
                    self.stdout.write(f"\nStarting full-site crawl: {website_name}")
                    try:
                        full_site_crawler(
                            WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                            website,
                            max_pages=max_pages
                        )
                        self.stdout.write(self.style.SUCCESS(f"Full-site crawl finished: {website_name}"))
                        website_results["full_crawl"] = {"status": "success"}
                        all_results["summary"]["full_crawl_success"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"Full-site crawl failed: {website_name}, error: {e}"))
                        website_results["full_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["full_crawl_failed"] += 1

                # Keyword crawl
                if mode in ['keyword', 'both']:
                    self.stdout.write(f"\nStarting keyword crawl: {website_name}")
                    try:
                        keyword_results = crawl_by_keyword(
                            keyword=keyword,
                            website_names=[website_name],
                            max_pages=max_search_pages,
                            start_date=start_date,
                            end_date=end_date,
                            max_articles=max_articles
                        )
                        website_results["keyword_crawl"] = keyword_results
                        if keyword_results["success_count"] > 0:
                            all_results["summary"]["keyword_crawl_success"] += 1
                        else:
                            all_results["summary"]["keyword_crawl_failed"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"Keyword crawl failed: {website_name}, error: {e}"))
                        website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["keyword_crawl_failed"] += 1

                all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
                all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]

            # Print the final summary
            self.stdout.write(f"\n{'='*50}")
            self.stdout.write(self.style.SUCCESS("One-click crawl finished!"))
            self.stdout.write(f"{'='*50}")
            self.stdout.write(f"Total websites: {all_results['summary']['total_websites']}")
            if mode in ['full', 'both']:
                self.stdout.write(f"Full-site crawl - success: {all_results['summary']['full_crawl_success']}, "
                                  f"failed: {all_results['summary']['full_crawl_failed']}")
            if mode in ['keyword', 'both']:
                self.stdout.write(f"Keyword crawl - success: {all_results['summary']['keyword_crawl_success']}, "
                                  f"failed: {all_results['summary']['keyword_crawl_failed']}")

            # Print per-website details
            self.stdout.write("\nPer-website results:")
            for website_name in target_websites:
                self.stdout.write(f"\n{website_name}:")
                if mode in ['full', 'both']:
                    full_result = all_results["full_crawl_results"][website_name]
                    if full_result and full_result.get("status") == "success":
                        self.stdout.write(self.style.SUCCESS("  Full-site crawl: success"))
                    elif full_result:
                        self.stdout.write(self.style.ERROR(f"  Full-site crawl: failed - {full_result.get('error', 'unknown error')}"))
                if mode in ['keyword', 'both']:
                    keyword_result = all_results["keyword_crawl_results"][website_name]
                    if keyword_result and "success_count" in keyword_result:
                        self.stdout.write(f"  Keyword crawl: {keyword_result['success_count']} articles succeeded, "
                                          f"{keyword_result['failed_count']} failed")
                    elif keyword_result and keyword_result.get("status") == "failed":
                        self.stdout.write(self.style.ERROR(f"  Keyword crawl: failed - {keyword_result.get('error', 'unknown error')}"))

            # Save the results to a file
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\nResults saved to: {output_file}")
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Error during one-click crawl: {e}"))
            raise
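For reference, a minimal sketch of invoking this command programmatically via Django's call_command is shown below. The command name crawl_all_websites is an assumption (the file path is not shown in this diff) and the argument values are illustrative only.

# Hypothetical invocation sketch -- "crawl_all_websites" is an assumed command name;
# replace it with the actual management command module name.
from django.core.management import call_command

call_command(
    'crawl_all_websites',
    mode='both',
    keyword='AI',           # required when mode is 'keyword' or 'both'
    max_pages=50,           # cap the full-site crawl
    max_search_pages=5,     # cap the keyword search pagination
    output='results.json',  # write the summary dict to a JSON file
)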


@@ -0,0 +1,157 @@
from django.core.management.base import BaseCommand
from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
import json
class Command(BaseCommand):
    help = "Crawl articles from multiple websites by keyword"

    def add_arguments(self, parser):
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='Search keyword'
        )
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='Names of the websites to crawl; if omitted, all supported websites are crawled'
        )
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=10,
            help='Maximum number of search pages per website (default: 10)'
        )
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='Maximum number of articles (default: 100)'
        )
        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='Start date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='End date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--historical',
            action='store_true',
            help='Crawl historical articles instead of searching by keyword'
        )
        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='List all supported websites'
        )
        parser.add_argument(
            '--output', '-o',
            type=str,
            help='Save the results to a JSON file'
        )
    def handle(self, *args, **options):
        # List the supported websites
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("Supported websites:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        keyword = options['keyword']
        if not keyword:
            self.stdout.write(self.style.ERROR("The --keyword argument is required"))
            return

        websites = options['websites']
        max_pages = options['max_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        historical = options['historical']
        output_file = options['output']

        # Validate website names
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"Unsupported websites: {', '.join(invalid_websites)}")
                )
                self.stdout.write("Use --list-websites to see the supported websites")
                return

        self.stdout.write("Starting crawl task...")
        self.stdout.write(f"Keyword: {keyword}")
        if websites:
            self.stdout.write(f"Target websites: {', '.join(websites)}")
        else:
            self.stdout.write(f"Target websites: all {len(WEBSITE_SEARCH_CONFIGS)} supported websites")
        if start_date:
            self.stdout.write(f"Start date: {start_date}")
        if end_date:
            self.stdout.write(f"End date: {end_date}")
        self.stdout.write(f"Max pages: {max_pages}")
        self.stdout.write(f"Max articles: {max_articles}")
        try:
            if historical:
                # Historical-article crawl mode
                self.stdout.write(self.style.WARNING("Using historical-article crawl mode"))
                results = crawl_historical_articles(
                    website_names=websites,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles_per_site=max_articles
                )
            else:
                # Keyword search mode
                results = crawl_by_keyword(
                    keyword=keyword,
                    website_names=websites,
                    max_pages=max_pages,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles=max_articles
                )

            # Print the summary
            self.stdout.write(self.style.SUCCESS("\nCrawl finished!"))
            self.stdout.write(f"Total articles: {results['total_articles']}")
            self.stdout.write(f"Succeeded: {results['success_count']}")
            self.stdout.write(f"Failed: {results['failed_count']}")

            # Print per-website results
            self.stdout.write("\nPer-website results:")
            for website, result in results['website_results'].items():
                status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
                self.stdout.write(
                    status(f"  {website}: found {result['found_urls']} URLs, "
                           f"{result['success']} succeeded, {result['failed']} failed")
                )
                if 'error' in result:
                    self.stdout.write(self.style.ERROR(f"    error: {result['error']}"))

            # Save the results to a file
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\nResults saved to: {output_file}")
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Error during crawl: {e}"))
            raise
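A similar hedged sketch for this keyword command: the command name crawl_keyword and the website name 'example_site' below are placeholders, since neither the file path nor the configured site names appear in this diff.

# Hypothetical invocation sketch -- "crawl_keyword" is an assumed command name
# and 'example_site' is a placeholder entry from WEBSITE_SEARCH_CONFIGS.
from django.core.management import call_command

call_command(
    'crawl_keyword',
    keyword='climate policy',
    websites=['example_site'],   # omit to crawl every configured website
    max_pages=5,
    start_date='2024-01-01',
    end_date='2024-12-31',
    output='keyword_results.json',
)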