Base setup

2025-09-23 13:30:03 +08:00
parent 1057ed8690
commit e51154bb29
34 changed files with 2574 additions and 1 deletion


@@ -0,0 +1,69 @@
from django.core.management.base import BaseCommand

from crawler.models import CrawlTask, Website
from crawler.tasks import crawl_websites_task


class Command(BaseCommand):
    help = 'Run a crawler task'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keywords',
            type=str,
            required=True,
            help='Search keywords; separate multiple keywords with commas'
        )
        parser.add_argument(
            '--websites',
            type=str,
            help='Comma-separated list of website IDs; if omitted, all websites are crawled'
        )
        parser.add_argument(
            '--name',
            type=str,
            help='Task name'
        )

    def handle(self, *args, **options):
        keywords = options['keywords']
        website_ids = options.get('websites')
        # argparse stores None for an omitted option, so options.get('name', default)
        # would never apply the default; fall back explicitly instead.
        task_name = options.get('name') or f'Keyword search: {keywords}'

        # Resolve the target websites
        if website_ids:
            website_id_list = [int(id.strip()) for id in website_ids.split(',')]
            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
        else:
            websites = Website.objects.filter(is_active=True)

        if not websites.exists():
            self.stdout.write(
                self.style.ERROR('No available websites found')
            )
            return

        # Create the task
        task = CrawlTask.objects.create(
            name=task_name,
            keywords=keywords,
            created_by='management_command'
        )
        task.websites.set(websites)

        self.stdout.write(f'Created task: {task.name}')
        self.stdout.write(f'Target websites: {websites.count()}')
        self.stdout.write(f'Search keywords: {keywords}')

        # Start the task (falls back to synchronous mode if Redis is unavailable)
        try:
            crawl_websites_task.delay(task.id)
            self.stdout.write('Task submitted to the queue')
        except Exception as e:
            self.stdout.write(f'Queue unavailable, running task directly: {e}')
            from crawler.crawler_engine import WebsiteCrawler
            crawler = WebsiteCrawler(task.id)
            crawler.run()

        self.stdout.write(
            self.style.SUCCESS(f'Task started, task ID: {task.id}')
        )
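
For reference, a typical invocation would look like the following, assuming the file lives at crawler/management/commands/run_crawler.py (the diff does not show the file path, so the command name run_crawler and the argument values are placeholders):

    python manage.py run_crawler --keywords "python,django" --websites 1,2 --name "My crawl task"

Omitting --websites crawls every active website, and omitting --name falls back to the generated "Keyword search: ..." title.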