Files
green_classroom/core/management/commands/crawl_grrb.py

59 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
    """Crawl the Workers' Daily (工人日报) web platforms.

    For every selected platform the command makes sure a ``Website`` row with
    the current configuration exists (creating or updating it by ``name``) and
    then passes the platform's start URL and that row to ``full_site_crawler``.
    """

    help = "全站递归爬取 工人日报及其子网站、客户端、新媒体平台"

    # Page budget handed to the crawler as ``max_pages`` when the caller does
    # not supply ``--max-pages``.
    DEFAULT_MAX_PAGES = 500

    # Per-platform crawl configuration, keyed by the value accepted by
    # ``--platform``. Kept in one place so the CLI choices cannot drift from
    # the configured platforms.
    PLATFORMS = {
        'grrb': {
            'name': '工人日报',
            'base_url': 'http://www.workercn.cn',
            'start_url': 'http://www.workercn.cn',
            'article_selector': 'a',
        },
        'mobile': {
            'name': '工人日报移动端',
            # Fix: make sure the mobile-site URL is correct.
            'base_url': 'http://m.workercn.cn',
            'start_url': 'http://m.workercn.cn',
            'article_selector': 'a',
        },
    }

    def add_arguments(self, parser):
        """Register the command-line options.

        Args:
            parser: The argument parser supplied by Django.
        """
        parser.add_argument(
            '--platform',
            type=str,
            default='all',
            # Evaluates to ['grrb', 'mobile', 'all'], same as the former
            # hand-written list.
            choices=[*self.PLATFORMS, 'all'],
            help='选择爬取平台: grrb(工人日报), mobile(移动端), all(全部)',
        )
        parser.add_argument(
            '--max-pages',
            type=int,
            default=self.DEFAULT_MAX_PAGES,
            help='每个平台最多爬取的页面数',
        )

    def handle(self, *args, **options):
        """Run the crawl for the requested platform(s).

        Args:
            *args: Unused positional arguments.
            **options: Parsed options. ``platform`` is required; ``max_pages``
                is optional and falls back to ``DEFAULT_MAX_PAGES``.

        Raises:
            KeyError: If ``platform`` is missing from ``options`` or is
                neither ``'all'`` nor a key of ``PLATFORMS``.
        """
        platform = options['platform']
        # ``.get`` keeps callers that invoke handle() directly with only
        # ``platform`` working exactly as before.
        max_pages = options.get('max_pages', self.DEFAULT_MAX_PAGES)

        if platform == 'all':
            target_platforms = list(self.PLATFORMS.values())
        else:
            target_platforms = [self.PLATFORMS[platform]]

        for platform_config in target_platforms:
            # Create the row if it is missing; otherwise overwrite its stored
            # configuration so an existing row always reflects the values
            # above.
            website, _ = Website.objects.update_or_create(
                name=platform_config['name'],
                defaults={
                    'base_url': platform_config['base_url'],
                    'article_list_url': platform_config['start_url'],
                    'article_selector': platform_config['article_selector'],
                },
            )

            self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
            full_site_crawler(platform_config['start_url'], website, max_pages=max_pages)
            self.stdout.write(f"完成爬取: {platform_config['name']}")

        self.stdout.write(self.style.SUCCESS("工人日报所有平台爬取完成"))