-
- {% for article in page_obj %}
-
- - - {{ article.title }} - - - {% empty %} -
- 暂无文章 - {% endfor %} -
diff --git a/core/admin.py b/core/admin.py index 9e86878..3479ae2 100644 --- a/core/admin.py +++ b/core/admin.py @@ -1,9 +1,6 @@ -from django.contrib import admin -from django.contrib.admin import AdminSite from .models import Website, Article # 添加actions相关的导入 from django.contrib import messages -from django.http import HttpResponseRedirect # 添加导出功能所需导入 import csv from django.http import HttpResponse @@ -12,17 +9,7 @@ import json from django.shortcuts import render, redirect from django.urls import path from django.contrib import admin -from django.http import JsonResponse -from django.views.decorators.http import require_http_methods from django.core.management import call_command -import threading -import uuid -from django.utils import timezone - - -# 创建自定义管理站点 - -# 实例化管理站点 # 添加运行爬虫的视图函数 def run_crawler_view(request): @@ -36,21 +23,24 @@ def run_crawler_view(request): return redirect('admin:core_article_changelist') try: - # 根据网站名称确定要执行的爬虫命令 - if website_name == 'crawl_xinhua': - crawler_name = 'crawl_xinhua' - elif website_name == 'crawl_dongfangyancao': - crawler_name = 'crawl_dongfangyancao' - elif website_name == 'crawl_articles': - crawler_name = 'crawl_articles' - else: - # 对于其他网站,使用通用爬虫命令 - crawler_name = 'crawl_articles' + # 动态获取网站对象 + website = Website.objects.get(name=website_name) + + # 根据网站对象确定要执行的爬虫命令 + # 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令 + crawler_name = getattr(website, 'crawler_command', None) + + # 如果网站没有配置爬虫命令,则报错 + if not crawler_name: + messages.error(request, f'网站 {website_name} 未配置爬虫命令') + return redirect('admin:core_article_changelist') - # 运行爬虫命令,不传递website_name作为参数 - call_command(crawler_name) + # 运行爬虫命令,传递网站名称 + call_command(crawler_name, website_name) - messages.success(request, f'成功执行爬虫: {crawler_name}') + messages.success(request, f'成功执行爬虫: {website_name}') + except Website.DoesNotExist: + messages.error(request, f'网站不存在: {website_name}') except Exception as e: messages.error(request, f'执行爬虫失败: {str(e)}') @@ -71,6 +61,10 @@ class ArticleAdmin(admin.ModelAdmin): actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json', 'export_as_word', 'export_with_media'] + def get_websites(self): + """获取所有启用的网站""" + return Website.objects.filter(enabled=True) + # 重写get_urls方法,添加自定义URL def get_urls(self): urls = super().get_urls() diff --git a/core/management/commands/crawl_cngov.py b/core/management/commands/crawl_cngov.py index 40d9146..0bd09e9 100644 --- a/core/management/commands/crawl_cngov.py +++ b/core/management/commands/crawl_cngov.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def handle(self, *args, **kwargs): website, created = Website.objects.get_or_create( - name="www.gov.cn", + name="中国政府网", defaults={ 'article_list_url': 'https://www.gov.cn/', 'article_selector': 'a' diff --git a/core/management/commands/crawl_rmrb.py b/core/management/commands/crawl_rmrb.py new file mode 100644 index 0000000..f9bb02c --- /dev/null +++ b/core/management/commands/crawl_rmrb.py @@ -0,0 +1,26 @@ +from django.core.management.base import BaseCommand +from core.models import Website +from core.utils import full_site_crawler + + +class Command(BaseCommand): + help = "全站递归爬取 人民日报 https://www.peopleapp.com" + + def handle(self, *args, **kwargs): + website, created = Website.objects.get_or_create( + name="人民日报", + defaults={ + 'article_list_url': 'https://www.peopleapp.com/home', + 'article_selector': 'a', + 'base_url': 'https://www.peopleapp.com' + } + ) + # 确保更新已存在的网站对象的base_url + if not created and not website.base_url: + website.base_url = 'https://www.peopleapp.com' + website.save() + + start_url = "https://www.peopleapp.com/home" + self.stdout.write(f"开始全站爬取: {start_url}") + full_site_crawler(start_url, website, max_pages=500) + self.stdout.write("爬取完成") \ No newline at end of file diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py index ba9d57f..bada862 100644 --- a/core/management/commands/crawl_xinhua.py +++ b/core/management/commands/crawl_xinhua.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def handle(self, *args, **kwargs): website, created = Website.objects.get_or_create( - name="www.news.cn", + name="新华网", defaults={ 'article_list_url': 'https://www.news.cn/', 'article_selector': 'a' diff --git a/core/management/commands/export_articles.py b/core/management/commands/export_articles.py index f144f0b..bf683d0 100644 --- a/core/management/commands/export_articles.py +++ b/core/management/commands/export_articles.py @@ -4,7 +4,6 @@ import json import csv import os from django.conf import settings -from django.core.files.storage import default_storage import zipfile from django.utils import timezone diff --git a/core/templates/admin/core/article/change_list.html b/core/templates/admin/core/article/change_list.html index 26554d0..7fe0384 100644 --- a/core/templates/admin/core/article/change_list.html +++ b/core/templates/admin/core/article/change_list.html @@ -9,9 +9,9 @@ diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html index 896eb2c..5fe39af 100644 --- a/core/templates/core/article_detail.html +++ b/core/templates/core/article_detail.html @@ -2,7 +2,8 @@
-