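"""Management command that creates a CrawlTask and launches it.

Usage sketch: a Django management command's name comes from its file name,
which is not shown in this extract. Assuming the file lives at
crawler/management/commands/run_crawler.py, an invocation would look like:

    python manage.py run_crawler --keywords "python,django" --websites 1,2 --name "Nightly crawl"
"""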
from django.core.management.base import BaseCommand

from crawler.models import CrawlTask, Website
from crawler.tasks import crawl_websites_task


class Command(BaseCommand):
    help = 'Run a crawler task'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keywords',
            type=str,
            required=True,
            help='Search keywords; separate multiple keywords with commas'
        )
        parser.add_argument(
            '--websites',
            type=str,
            help='Comma-separated list of website IDs; all active websites are crawled if omitted'
        )
        parser.add_argument(
            '--name',
            type=str,
            help='Task name'
        )

    def handle(self, *args, **options):
        keywords = options['keywords']
        website_ids = options.get('websites')
        # Django always populates the 'name' key (None when --name is
        # omitted), so a dict.get() default would never apply; use `or`
        # to fall back instead.
        task_name = options.get('name') or f'Keyword search: {keywords}'

        # Resolve the target websites
        if website_ids:
            website_id_list = [int(wid.strip()) for wid in website_ids.split(',')]
            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
        else:
            websites = Website.objects.filter(is_active=True)

        if not websites.exists():
            self.stdout.write(
                self.style.ERROR('No available websites found')
            )
            return

        # Create the task
        task = CrawlTask.objects.create(
            name=task_name,
            keywords=keywords,
            created_by='management_command'
        )
        task.websites.set(websites)

        self.stdout.write(f'Created task: {task.name}')
        self.stdout.write(f'Target websites: {websites.count()}')
        self.stdout.write(f'Search keywords: {keywords}')

        # Launch the task; if the queue broker (e.g. Redis) is
        # unavailable, fall back to running the crawl synchronously.
        try:
            crawl_websites_task.delay(task.id)
            self.stdout.write('Task submitted to the queue')
        except Exception as e:
            self.stdout.write(f'Queue unavailable, running the task directly: {e}')
            from crawler.crawler_engine import WebsiteCrawler
            crawler = WebsiteCrawler(task.id)
            crawler.run()

        self.stdout.write(
            self.style.SUCCESS(f'Task started, task ID: {task.id}')
        )