Support keyword
core/management/commands/crawl_all_websites.py (new file, 257 lines added)
@@ -0,0 +1,257 @@
from django.core.management.base import BaseCommand
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
from core.models import Website
import json


class Command(BaseCommand):
    help = "Crawl all supported websites in one run"

    def add_arguments(self, parser):
        parser.add_argument(
            '--mode', '-m',
            type=str,
            choices=['full', 'keyword', 'both'],
            default='full',
            help='Crawl mode: full (full-site crawl), keyword (keyword search crawl), both (run both modes)'
        )
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='Search keyword used in keyword mode'
        )
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='Names of the websites to crawl; if omitted, all supported websites are crawled'
        )
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=500,
            help='Maximum number of pages for the full-site crawl (default: 500)'
        )
        parser.add_argument(
            '--max-search-pages', '-sp',
            type=int,
            default=10,
            help='Maximum number of search pages for the keyword crawl (default: 10)'
        )
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='Maximum number of articles for the keyword crawl (default: 100)'
        )
        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='Start date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='End date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='List all supported websites'
        )
        parser.add_argument(
            '--output', '-o',
            type=str,
            help='Save the results to a JSON file'
        )
        parser.add_argument(
            '--skip-existing',
            action='store_true',
            help='Skip websites whose configuration already exists'
        )

    def handle(self, *args, **options):
        # List the supported websites and exit
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("Supported websites:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        mode = options['mode']
        keyword = options['keyword']
        websites = options['websites']
        max_pages = options['max_pages']
        max_search_pages = options['max_search_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        output_file = options['output']
        skip_existing = options['skip_existing']

        # Validate the website names
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"Unsupported websites: {', '.join(invalid_websites)}")
                )
                self.stdout.write("Use --list-websites to see the supported websites")
                return

        # Determine the list of websites to crawl
        target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())

        # Keyword mode requires a keyword
        if mode in ['keyword', 'both'] and not keyword:
            self.stdout.write(
                self.style.ERROR("Keyword mode requires the --keyword argument")
            )
            return

        self.stdout.write("Starting the crawl task...")
        self.stdout.write(f"Crawl mode: {mode}")
        self.stdout.write(f"Target websites: {', '.join(target_websites)}")
        if keyword:
            self.stdout.write(f"Keyword: {keyword}")
        if start_date:
            self.stdout.write(f"Start date: {start_date}")
        if end_date:
            self.stdout.write(f"End date: {end_date}")

        all_results = {
            "mode": mode,
            "websites": target_websites,
            "keyword": keyword,
            "start_date": start_date,
            "end_date": end_date,
            "full_crawl_results": {},
            "keyword_crawl_results": {},
            "summary": {
                "total_websites": len(target_websites),
                "full_crawl_success": 0,
                "full_crawl_failed": 0,
                "keyword_crawl_success": 0,
                "keyword_crawl_failed": 0
            }
        }

        try:
            for website_name in target_websites:
                self.stdout.write(f"\n{'='*50}")
                self.stdout.write(f"Processing website: {website_name}")
                self.stdout.write(f"{'='*50}")

                # Get or create the Website record for this site
                website, created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
                        'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                        'enabled': True
                    }
                )

                if not created and skip_existing:
                    self.stdout.write(f"Skipping existing website: {website_name}")
                    continue

                website_results = {
                    "full_crawl": None,
                    "keyword_crawl": None
                }

                # Full-site crawl
                if mode in ['full', 'both']:
                    self.stdout.write(f"\nStarting full-site crawl: {website_name}")
                    try:
                        full_site_crawler(
                            WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                            website,
                            max_pages=max_pages
                        )
                        self.stdout.write(self.style.SUCCESS(f"Full-site crawl finished: {website_name}"))
                        website_results["full_crawl"] = {"status": "success"}
                        all_results["summary"]["full_crawl_success"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"Full-site crawl failed: {website_name}, error: {e}"))
                        website_results["full_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["full_crawl_failed"] += 1

                # Keyword crawl
                if mode in ['keyword', 'both']:
                    self.stdout.write(f"\nStarting keyword crawl: {website_name}")
                    try:
                        keyword_results = crawl_by_keyword(
                            keyword=keyword,
                            website_names=[website_name],
                            max_pages=max_search_pages,
                            start_date=start_date,
                            end_date=end_date,
                            max_articles=max_articles
                        )
                        website_results["keyword_crawl"] = keyword_results
                        if keyword_results["success_count"] > 0:
                            all_results["summary"]["keyword_crawl_success"] += 1
                        else:
                            all_results["summary"]["keyword_crawl_failed"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"Keyword crawl failed: {website_name}, error: {e}"))
                        website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["keyword_crawl_failed"] += 1

                all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
                all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]

            # Print the final summary
            self.stdout.write(f"\n{'='*50}")
            self.stdout.write(self.style.SUCCESS("Crawl run finished!"))
            self.stdout.write(f"{'='*50}")
            self.stdout.write(f"Total websites: {all_results['summary']['total_websites']}")

            if mode in ['full', 'both']:
                self.stdout.write(f"Full-site crawl - succeeded: {all_results['summary']['full_crawl_success']}, "
                                  f"failed: {all_results['summary']['full_crawl_failed']}")

            if mode in ['keyword', 'both']:
                self.stdout.write(f"Keyword crawl - succeeded: {all_results['summary']['keyword_crawl_success']}, "
                                  f"failed: {all_results['summary']['keyword_crawl_failed']}")

            # Print per-website details; skipped sites may have no entry, so use .get()
            self.stdout.write("\nPer-website results:")
            for website_name in target_websites:
                self.stdout.write(f"\n{website_name}:")

                if mode in ['full', 'both']:
                    full_result = all_results["full_crawl_results"].get(website_name)
                    if full_result and full_result.get("status") == "success":
                        self.stdout.write(self.style.SUCCESS("  Full-site crawl: success"))
                    elif full_result:
                        self.stdout.write(self.style.ERROR(f"  Full-site crawl: failed - {full_result.get('error', 'unknown error')}"))

                if mode in ['keyword', 'both']:
                    keyword_result = all_results["keyword_crawl_results"].get(website_name)
                    if keyword_result and "success_count" in keyword_result:
                        self.stdout.write(f"  Keyword crawl: {keyword_result['success_count']} articles succeeded, "
                                          f"{keyword_result['failed_count']} failed")
                    elif keyword_result and keyword_result.get("status") == "failed":
                        self.stdout.write(self.style.ERROR(f"  Keyword crawl: failed - {keyword_result.get('error', 'unknown error')}"))

            # Save the results to a file
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\nResults saved to: {output_file}")

        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Error during the crawl: {e}"))
            raise
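For reference, a minimal sketch of driving the new command from code with Django's call_command; the site names, keyword, dates, and output path below are placeholders, and real site names must be keys of WEBSITE_SEARCH_CONFIGS:

from django.core.management import call_command

# Full-site crawl of every supported website, capped at 100 pages per site,
# with the run summary written to crawl_results.json.
call_command('crawl_all_websites', mode='full', max_pages=100,
             output='crawl_results.json')

# Keyword crawl of two specific sites over a date range.
call_command('crawl_all_websites', mode='keyword', keyword='example topic',
             websites=['SiteA', 'SiteB'],
             start_date='2024-01-01', end_date='2024-12-31')

The equivalent shell form is python manage.py crawl_all_websites --mode keyword --keyword "example topic" --websites SiteA SiteB.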
core/management/commands/crawl_by_keyword.py (new file, 157 lines added)
@@ -0,0 +1,157 @@
from django.core.management.base import BaseCommand
from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
import json


class Command(BaseCommand):
    help = "Crawl articles from multiple websites by keyword"

    def add_arguments(self, parser):
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='Search keyword'
        )
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='Names of the websites to crawl; if omitted, all supported websites are crawled'
        )
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=10,
            help='Maximum number of search pages per website (default: 10)'
        )
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='Maximum number of articles (default: 100)'
        )
        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='Start date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='End date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--historical',
            action='store_true',
            help='Crawl historical articles instead of searching by keyword'
        )
        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='List all supported websites'
        )
        parser.add_argument(
            '--output', '-o',
            type=str,
            help='Save the results to a JSON file'
        )

    def handle(self, *args, **options):
        # List the supported websites and exit
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("Supported websites:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        keyword = options['keyword']
        if not keyword:
            self.stdout.write(self.style.ERROR("The --keyword argument is required"))
            return
        websites = options['websites']
        max_pages = options['max_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        historical = options['historical']
        output_file = options['output']

        # Validate the website names
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"Unsupported websites: {', '.join(invalid_websites)}")
                )
                self.stdout.write("Use --list-websites to see the supported websites")
                return

        self.stdout.write("Starting the crawl task...")
        self.stdout.write(f"Keyword: {keyword}")
        if websites:
            self.stdout.write(f"Target websites: {', '.join(websites)}")
        else:
            self.stdout.write(f"Target websites: all supported websites ({len(WEBSITE_SEARCH_CONFIGS)})")

        if start_date:
            self.stdout.write(f"Start date: {start_date}")
        if end_date:
            self.stdout.write(f"End date: {end_date}")
        self.stdout.write(f"Max pages: {max_pages}")
        self.stdout.write(f"Max articles: {max_articles}")

        try:
            if historical:
                # Historical-article crawl mode
                self.stdout.write(self.style.WARNING("Using historical-article crawl mode"))
                results = crawl_historical_articles(
                    website_names=websites,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles_per_site=max_articles
                )
            else:
                # Keyword search mode
                results = crawl_by_keyword(
                    keyword=keyword,
                    website_names=websites,
                    max_pages=max_pages,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles=max_articles
                )

            # Print the result summary
            self.stdout.write(self.style.SUCCESS("\nCrawl finished!"))
            self.stdout.write(f"Total articles: {results['total_articles']}")
            self.stdout.write(f"Succeeded: {results['success_count']}")
            self.stdout.write(f"Failed: {results['failed_count']}")

            # Print per-website details
            self.stdout.write("\nPer-website results:")
            for website, result in results['website_results'].items():
                status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
                self.stdout.write(
                    status(f"  {website}: {result['found_urls']} articles found, "
                           f"{result['success']} succeeded, {result['failed']} failed")
                )
                if 'error' in result:
                    self.stdout.write(self.style.ERROR(f"    Error: {result['error']}"))

            # Save the results to a file
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\nResults saved to: {output_file}")

        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Error during the crawl: {e}"))
            raise
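A similarly minimal sketch of consuming a results file written with --output; it assumes only the keys the command itself prints (total_articles, success_count, failed_count, website_results), and the file name is a placeholder:

import json

# Reload a saved run and rebuild the per-site summary the command prints.
with open('keyword_results.json', encoding='utf-8') as f:
    results = json.load(f)

print(f"total: {results['total_articles']}, ok: {results['success_count']}, failed: {results['failed_count']}")
for site, r in results['website_results'].items():
    line = f"{site}: found {r['found_urls']}, ok {r['success']}, failed {r['failed']}"
    if 'error' in r:
        line += f" (error: {r['error']})"
    print(line)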