61 lines
2.4 KiB
Python
61 lines
2.4 KiB
Python
from django.core.management.base import BaseCommand
|
||
from core.models import Website
|
||
from core.utils import full_site_crawler
|
||
|
||
|
||
# jimmy.fang-20250815: 因URL问题,移除中国网-省份
|
||
class Command(BaseCommand):
    """Crawl the configured china.com.cn platforms with ``full_site_crawler``.

    For every selected platform the matching ``Website`` row is created or
    refreshed from the configuration below, and then handed to
    ``full_site_crawler`` together with the platform's start URL.
    """

    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"

    # Value passed as ``max_pages`` to full_site_crawler for each platform
    # (presumably an upper bound on crawled pages -- confirm in core.utils).
    MAX_PAGES = 500

    # Crawl configuration keyed by the value accepted for ``--platform``.
    # The 'province' platform (start URL http://www.china.com.cn/province)
    # was removed on 2025-08-15 because of URL problems; re-add an entry here
    # to enable it again.
    PLATFORMS = {
        'china': {
            'name': '中国网',
            'base_url': 'http://www.china.com.cn',
            'start_url': 'http://www.china.com.cn',
            'article_selector': 'a',
        },
    }

    def add_arguments(self, parser):
        """Register the ``--platform`` option (defaults to ``all``)."""
        # 'province' stays in the accepted choices so existing invocations
        # still parse; handle() rejects it with a clear error while the
        # platform is disabled.
        parser.add_argument(
            '--platform',
            type=str,
            default='all',
            choices=['china', 'province', 'all'],
            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)',
        )

    def handle(self, *args, **options):
        """Crawl the platform(s) selected through ``options['platform']``.

        Raises:
            CommandError: if the requested platform has no entry in
                ``PLATFORMS`` (for example the currently disabled
                ``province`` platform).
        """
        # Imported here so this class does not rely on the module-level
        # import list being extended.
        from django.core.management.base import CommandError

        platform = options['platform']

        if platform == 'all':
            target_platforms = self.PLATFORMS.values()
        elif platform in self.PLATFORMS:
            target_platforms = [self.PLATFORMS[platform]]
        else:
            # Previously this path died with a bare KeyError traceback.
            available = ', '.join(self.PLATFORMS)
            raise CommandError(
                f"平台 '{platform}' 当前不可用, 可选平台: {available}, all"
            )

        for platform_config in target_platforms:
            # Create the Website row if it is missing; otherwise overwrite its
            # stored settings so they always match the configuration above.
            website, _created = Website.objects.update_or_create(
                name=platform_config['name'],
                defaults={
                    'base_url': platform_config['base_url'],
                    'article_list_url': platform_config['start_url'],
                    'article_selector': platform_config['article_selector'],
                },
            )

            self.stdout.write(
                f"开始爬取: {platform_config['name']} - {platform_config['start_url']}"
            )
            full_site_crawler(
                platform_config['start_url'],
                website,
                max_pages=self.MAX_PAGES,
            )
            self.stdout.write(f"完成爬取: {platform_config['name']}")

        self.stdout.write(self.style.SUCCESS("中国网所有平台爬取完成"))
|