Files
green_classroom/core/management/commands/crawl_china.py

61 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from django.core.management.base import BaseCommand, CommandError

from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang-20250815: 因URL问题移除中国网-省份
class Command(BaseCommand):
    """Management command that crawls the configured China.com.cn platforms.

    For every selected platform the matching ``Website`` row is created or
    refreshed from the in-code configuration, and ``full_site_crawler`` is
    then invoked on the platform's start URL.
    """

    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"

    def add_arguments(self, parser):
        """Register the ``--platform`` and ``--max-pages`` options.

        Args:
            parser: The argument parser supplied by Django.
        """
        parser.add_argument(
            '--platform', type=str, default='all',
            choices=['china', 'province', 'all'],
            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)',
        )
        # Previously hard-coded to 500 in handle(); the default keeps the
        # old behaviour for existing invocations.
        parser.add_argument(
            '--max-pages', type=int, default=500,
            help='每个平台传给爬虫的 max_pages 值 (默认: 500)',
        )

    def handle(self, *args, **options):
        """Crawl the platform(s) selected via ``--platform``.

        Args:
            *args: Unused positional arguments passed by Django.
            **options: Parsed command-line options; reads ``platform`` and,
                when present, ``max_pages`` (falls back to 500).

        Raises:
            CommandError: If the requested platform is an accepted CLI
                choice but has no active configuration (e.g. ``province``).
        """
        platform = options['platform']
        # .get() keeps direct handle() callers that omit the option working.
        max_pages = options.get('max_pages', 500)

        # Per-platform crawl configuration.
        platforms = {
            'china': {
                'name': '中国网',
                'base_url': 'http://www.china.com.cn',
                'start_url': 'http://www.china.com.cn',
                'article_selector': 'a',
            },
            # The 'province' platform is disabled for now because of URL
            # problems; restore this entry to re-enable it.
            # 'province': {
            #     'name': '中国网一省份',
            #     'base_url': 'http://www.china.com.cn',
            #     'start_url': 'http://www.china.com.cn/province',
            #     'article_selector': 'a'
            # }
        }

        if platform == 'all':
            target_platforms = list(platforms.values())
        elif platform in platforms:
            target_platforms = [platforms[platform]]
        else:
            # 'province' is still an accepted --platform choice but has no
            # active config; fail with a clear message instead of a KeyError.
            raise CommandError(f"平台 '{platform}' 当前已停用,无法爬取")

        for platform_config in target_platforms:
            # Create the Website row, or overwrite an existing row's
            # settings so it always mirrors the configuration above.
            website, _ = Website.objects.update_or_create(
                name=platform_config['name'],
                defaults={
                    'base_url': platform_config['base_url'],
                    'article_list_url': platform_config['start_url'],
                    'article_selector': platform_config['article_selector'],
                },
            )

            self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
            full_site_crawler(platform_config['start_url'], website, max_pages=max_pages)
            self.stdout.write(f"完成爬取: {platform_config['name']}")

        self.stdout.write(self.style.SUCCESS("中国网所有平台爬取完成"))