Base setup

2025-09-23 13:30:03 +08:00
parent 1057ed8690
commit e51154bb29
34 changed files with 2574 additions and 1 deletion

@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.conf import settings

from crawler.models import Website


class Command(BaseCommand):
    help = 'Initialize target website data'

    def handle(self, *args, **options):
        self.stdout.write('Initializing target website data...')

        # Clear existing data
        Website.objects.all().delete()

        # Load the website list from settings
        websites_data = settings.TARGET_WEBSITES

        created_count = 0
        for website_data in websites_data:
            website, created = Website.objects.get_or_create(
                url=website_data['url'],
                defaults={
                    'name': website_data['name'],
                    'region': website_data['region'],
                    'is_active': True
                }
            )
            if created:
                created_count += 1
                self.stdout.write(f'Created website: {website.name}')
            else:
                self.stdout.write(f'Website already exists: {website.name}')

        self.stdout.write(
            self.style.SUCCESS(f'Successfully initialized {created_count} websites')
        )
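
For context, this command expects settings.TARGET_WEBSITES to be an iterable of dicts with 'url', 'name', and 'region' keys. A minimal sketch of what that setting might look like (the values below are placeholders; the real entries are defined elsewhere in the project and are not part of this diff):

# settings.py -- sketch only; actual entries live elsewhere in the project
TARGET_WEBSITES = [
    {'url': 'https://example.com/news', 'name': 'Example News', 'region': 'North America'},
    {'url': 'https://example.org/press', 'name': 'Example Press', 'region': 'Europe'},
]

Assuming the file is placed under crawler/management/commands/ with a name such as init_websites.py (the path is not shown in this diff), it would be run as python manage.py init_websites.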


@@ -0,0 +1,69 @@
from django.core.management.base import BaseCommand

from crawler.models import CrawlTask, Website
from crawler.tasks import crawl_websites_task


class Command(BaseCommand):
    help = 'Run a crawl task'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keywords',
            type=str,
            required=True,
            help='Search keywords, separated by commas'
        )
        parser.add_argument(
            '--websites',
            type=str,
            help='Comma-separated list of website IDs; if omitted, all websites are crawled'
        )
        parser.add_argument(
            '--name',
            type=str,
            help='Task name'
        )

    def handle(self, *args, **options):
        keywords = options['keywords']
        website_ids = options.get('websites')
        # options['name'] is None when the flag is omitted, so fall back explicitly
        task_name = options.get('name') or f'Keyword search: {keywords}'

        # Resolve the target websites
        if website_ids:
            website_id_list = [int(id.strip()) for id in website_ids.split(',')]
            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
        else:
            websites = Website.objects.filter(is_active=True)

        if not websites.exists():
            self.stdout.write(
                self.style.ERROR('No active websites found')
            )
            return

        # Create the task
        task = CrawlTask.objects.create(
            name=task_name,
            keywords=keywords,
            created_by='management_command'
        )
        task.websites.set(websites)

        self.stdout.write(f'Created task: {task.name}')
        self.stdout.write(f'Target websites: {websites.count()}')
        self.stdout.write(f'Search keywords: {keywords}')

        # Start the task (run synchronously if the queue/Redis is unavailable)
        try:
            crawl_websites_task.delay(task.id)
            self.stdout.write('Task submitted to the queue')
        except Exception as e:
            self.stdout.write(f'Queue unavailable, running task directly: {e}')
            from crawler.crawler_engine import WebsiteCrawler
            crawler = WebsiteCrawler(task.id)
            crawler.run()

        self.stdout.write(
            self.style.SUCCESS(f'Task started, task ID: {task.id}')
        )
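
Assuming this file also sits under crawler/management/commands/ with a name such as run_crawler.py (hypothetical; the path is not shown in this diff), a typical invocation might look like the line below, with placeholder keyword and ID values:

# Crawl websites 1 and 3 for two comma-separated keywords; all values are illustrative
python manage.py run_crawler --keywords "solar,wind" --websites 1,3 --name "Energy sweep"

--keywords is required; --websites and --name are optional and default to all active websites and an auto-generated 'Keyword search: ...' name, respectively.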