Base setup
0  crawler/management/__init__.py  Normal file
0  crawler/management/commands/__init__.py  Normal file
36  crawler/management/commands/init_websites.py  Normal file
@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.conf import settings
from crawler.models import Website


class Command(BaseCommand):
    help = 'Initialize target website data'

    def handle(self, *args, **options):
        self.stdout.write('Initializing target website data...')

        # Clear existing data
        Website.objects.all().delete()

        # Load the website list from settings
        websites_data = settings.TARGET_WEBSITES

        created_count = 0
        for website_data in websites_data:
            # Note: since all rows were just deleted, get_or_create always
            # creates here; the else branch only matters if the delete
            # above is ever removed.
            website, created = Website.objects.get_or_create(
                url=website_data['url'],
                defaults={
                    'name': website_data['name'],
                    'region': website_data['region'],
                    'is_active': True
                }
            )
            if created:
                created_count += 1
                self.stdout.write(f'Created website: {website.name}')
            else:
                self.stdout.write(f'Website already exists: {website.name}')

        self.stdout.write(
            self.style.SUCCESS(f'Successfully initialized {created_count} websites')
        )
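The command requires settings.TARGET_WEBSITES to be defined; from the keys read above, each entry must carry url, name, and region. A minimal sketch of that setting, with placeholder sites (the real list lives in the project's settings.py):

# settings.py -- hypothetical example entries; only the url/name/region
# keys are required by init_websites above.
TARGET_WEBSITES = [
    {'url': 'https://example.com', 'name': 'Example Site', 'region': 'US'},
    {'url': 'https://example.org/news', 'name': 'Example News', 'region': 'EU'},
]

With the setting in place, the data is loaded with: python manage.py init_websites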
69  crawler/management/commands/run_crawler.py  Normal file
@@ -0,0 +1,69 @@
from django.core.management.base import BaseCommand
from crawler.models import CrawlTask, Website
from crawler.tasks import crawl_websites_task


class Command(BaseCommand):
    help = 'Run a crawler task'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keywords',
            type=str,
            required=True,
            help='Search keywords, comma-separated'
        )
        parser.add_argument(
            '--websites',
            type=str,
            help='Comma-separated list of website IDs; omit to crawl all websites'
        )
        parser.add_argument(
            '--name',
            type=str,
            help='Task name'
        )

    def handle(self, *args, **options):
        keywords = options['keywords']
        website_ids = options.get('websites')
        # argparse stores None for omitted options, so fall back explicitly;
        # dict.get()'s default argument would never fire here.
        task_name = options.get('name') or f'Keyword search: {keywords}'

        # Resolve the target websites
        if website_ids:
            website_id_list = [int(site_id.strip()) for site_id in website_ids.split(',')]
            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
        else:
            websites = Website.objects.filter(is_active=True)

        if not websites.exists():
            self.stdout.write(
                self.style.ERROR('No active websites found')
            )
            return

        # Create the task
        task = CrawlTask.objects.create(
            name=task_name,
            keywords=keywords,
            created_by='management_command'
        )
        task.websites.set(websites)

        self.stdout.write(f'Created task: {task.name}')
        self.stdout.write(f'Target websites: {websites.count()}')
        self.stdout.write(f'Search keywords: {keywords}')

        # Start the task (runs synchronously if no queue, e.g. Redis, is available)
        try:
            crawl_websites_task.delay(task.id)
            self.stdout.write('Task submitted to the queue')
        except Exception as e:
            self.stdout.write(f'Queue unavailable, running task directly: {e}')
            from crawler.crawler_engine import WebsiteCrawler
            crawler = WebsiteCrawler(task.id)
            crawler.run()

        self.stdout.write(
            self.style.SUCCESS(f'Task started, task ID: {task.id}')
        )
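For reference, a typical invocation given the arguments defined above (the keywords, IDs, and name are placeholders):

python manage.py run_crawler --keywords "keyword1,keyword2" --websites 1,3 --name "Sample task"

If no Celery broker (such as Redis) is reachable, .delay() raises and the except branch runs the crawl synchronously in the current process via WebsiteCrawler.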
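Both commands lean on crawler.models. A hypothetical minimal version of those models, inferred only from the fields the commands touch (the actual models in this repo may define more):

# crawler/models.py -- hypothetical sketch inferred from the commands above;
# field types and lengths are assumptions.
from django.db import models

class Website(models.Model):
    name = models.CharField(max_length=200)
    url = models.URLField()  # looked up by init_websites' get_or_create
    region = models.CharField(max_length=100)
    is_active = models.BooleanField(default=True)

class CrawlTask(models.Model):
    name = models.CharField(max_length=200)
    keywords = models.CharField(max_length=500)  # comma-separated, as passed by run_crawler
    created_by = models.CharField(max_length=100)
    websites = models.ManyToManyField(Website)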