diff --git a/core/admin.py b/core/admin.py
index 9e86878..3479ae2 100644
--- a/core/admin.py
+++ b/core/admin.py
@@ -1,9 +1,6 @@
-from django.contrib import admin
-from django.contrib.admin import AdminSite
 from .models import Website, Article
 # Imports for admin actions
 from django.contrib import messages
-from django.http import HttpResponseRedirect
 # Imports required by the export feature
 import csv
 from django.http import HttpResponse
@@ -12,17 +9,7 @@ import json
 from django.shortcuts import render, redirect
 from django.urls import path
 from django.contrib import admin
-from django.http import JsonResponse
-from django.views.decorators.http import require_http_methods
 from django.core.management import call_command
-import threading
-import uuid
-from django.utils import timezone
-
-
-# Create a custom admin site
-
-# Instantiate the admin site
 
 # View that runs a crawler
 def run_crawler_view(request):
@@ -36,21 +23,24 @@ def run_crawler_view(request):
         return redirect('admin:core_article_changelist')
 
     try:
-        # Determine the crawler command from the website name
-        if website_name == 'crawl_xinhua':
-            crawler_name = 'crawl_xinhua'
-        elif website_name == 'crawl_dongfangyancao':
-            crawler_name = 'crawl_dongfangyancao'
-        elif website_name == 'crawl_articles':
-            crawler_name = 'crawl_articles'
-        else:
-            # For any other website, fall back to the generic crawler command
-            crawler_name = 'crawl_articles'
+        # Look up the website object dynamically
+        website = Website.objects.get(name=website_name)
+
+        # Determine the crawler command from the website object.
+        # The generic fallback crawler is removed: every website must configure its own crawler command.
+        crawler_name = getattr(website, 'crawler_command', None)
+
+        # Report an error if the website has no crawler command configured
+        if not crawler_name:
+            messages.error(request, f'网站 {website_name} 未配置爬虫命令')
+            return redirect('admin:core_article_changelist')
 
-        # Run the crawler command without passing website_name as an argument
-        call_command(crawler_name)
+        # Run the crawler command, passing the website name
+        call_command(crawler_name, website_name)
 
-        messages.success(request, f'成功执行爬虫: {crawler_name}')
+        messages.success(request, f'成功执行爬虫: {website_name}')
+    except Website.DoesNotExist:
+        messages.error(request, f'网站不存在: {website_name}')
     except Exception as e:
         messages.error(request, f'执行爬虫失败: {str(e)}')
@@ -71,6 +61,10 @@ class ArticleAdmin(admin.ModelAdmin):
     actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json', 'export_as_word', 'export_with_media']
 
+    def get_websites(self):
+        """Return all enabled websites."""
+        return Website.objects.filter(enabled=True)
+
     # Override get_urls to add custom URLs
     def get_urls(self):
         urls = super().get_urls()
diff --git a/core/management/commands/crawl_cngov.py b/core/management/commands/crawl_cngov.py
index 40d9146..0bd09e9 100644
--- a/core/management/commands/crawl_cngov.py
+++ b/core/management/commands/crawl_cngov.py
@@ -8,7 +8,7 @@ class Command(BaseCommand):
 
     def handle(self, *args, **kwargs):
         website, created = Website.objects.get_or_create(
-            name="www.gov.cn",
+            name="中国政府网",
             defaults={
                 'article_list_url': 'https://www.gov.cn/',
                 'article_selector': 'a'
diff --git a/core/management/commands/crawl_rmrb.py b/core/management/commands/crawl_rmrb.py
new file mode 100644
index 0000000..f9bb02c
--- /dev/null
+++ b/core/management/commands/crawl_rmrb.py
@@ -0,0 +1,26 @@
+from django.core.management.base import BaseCommand
+from core.models import Website
+from core.utils import full_site_crawler
+
+
+class Command(BaseCommand):
+    help = "全站递归爬取 人民日报 https://www.peopleapp.com"
+
+    def handle(self, *args, **kwargs):
+        website, created = Website.objects.get_or_create(
+            name="人民日报",
+            defaults={
+                'article_list_url': 'https://www.peopleapp.com/home',
+                'article_selector': 'a',
+                'base_url': 'https://www.peopleapp.com'
+            }
+        )
+        # Make sure base_url gets filled in on a pre-existing website object
+        if not created and not website.base_url:
+            website.base_url = 'https://www.peopleapp.com'
+            website.save()
+
+        start_url = "https://www.peopleapp.com/home"
+        self.stdout.write(f"开始全站爬取: {start_url}")
+        full_site_crawler(start_url, website, max_pages=500)
+        self.stdout.write("爬取完成")
\ No newline at end of file
diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index ba9d57f..bada862 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -8,7 +8,7 @@ class Command(BaseCommand):
 
     def handle(self, *args, **kwargs):
         website, created = Website.objects.get_or_create(
-            name="www.news.cn",
+            name="新华网",
             defaults={
                 'article_list_url': 'https://www.news.cn/',
                 'article_selector': 'a'
diff --git a/core/management/commands/export_articles.py b/core/management/commands/export_articles.py
index f144f0b..bf683d0 100644
--- a/core/management/commands/export_articles.py
+++ b/core/management/commands/export_articles.py
@@ -4,7 +4,6 @@ import json
 import csv
 import os
 from django.conf import settings
-from django.core.files.storage import default_storage
 import zipfile
 from django.utils import timezone
diff --git a/core/templates/admin/core/article/change_list.html b/core/templates/admin/core/article/change_list.html
index 26554d0..7fe0384 100644
--- a/core/templates/admin/core/article/change_list.html
+++ b/core/templates/admin/core/article/change_list.html
@@ -9,9 +9,9 @@
[hunk body (admin template markup) was stripped during text extraction and is not recoverable]
diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html
index 896eb2c..5fe39af 100644
--- a/core/templates/core/article_detail.html
+++ b/core/templates/core/article_detail.html
@@ -2,7 +2,8 @@
[template markup was stripped during text extraction; recoverable changes: the page title changes from "绿色课堂文章列表" to "{{ article.title }} - 绿色课堂", a "« 返回文章列表" back link is added, the {% if selected_website %} block is removed, the article heading now renders {{ article.title }}, and the new file ends without a trailing newline]
diff --git a/core/templates/core/article_list.html b/core/templates/core/article_list.html
index 18e736b..699430f 100644
--- a/core/templates/core/article_list.html
+++ b/core/templates/core/article_list.html
@@ -17,7 +17,7 @@
         background: white;
         padding: 30px;
         margin-bottom: 20px;
-        box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* subtle shadow */
+        box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* subtle shadow */
         border-radius: 8px; /* rounded corners */
     }
@@ -240,7 +240,7 @@
 {% if selected_website %}
[one line of markup inside this block is replaced; the markup was stripped during text extraction]
 {% endif %}
@@ -251,11 +251,9 @@
[hunk body (template markup) was stripped during text extraction; 11 lines become 9]
@@ -264,10 +262,10 @@
 {% if search_query %}
[markup stripped during text extraction; the block showing 搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章 and the 清除搜索 link is removed and re-added with different markup]
 {% endif %}
@@ -282,70 +280,60 @@
     {% for article in page_obj %}
[markup stripped during text extraction; the loop body that renders the linked {{ article.title }}, ({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }}), and the {% empty %} branch with its 暂无文章 message is removed and re-added with different markup, through {% endfor %} (70 lines become 60)]
@@ -408,25 +396,25 @@
                 format: 'json'
             })
         })
-            .then(response => {
-                if (response.ok) {
-                    return response.blob();
-                }
-                throw new Error('导出失败');
-            })
-            .then(blob => {
-                const url = window.URL.createObjectURL(blob);
-                const a = document.createElement('a');
-                a.href = url;
-                a.download = 'articles.json';
-                document.body.appendChild(a);
-                a.click();
-                window.URL.revokeObjectURL(url);
-                document.body.removeChild(a);
-            })
-            .catch(error => {
-                alert('导出失败: ' + error);
-            });
+        .then(response => {
+            if (response.ok) {
+                return response.blob();
+            }
+            throw new Error('导出失败');
+        })
+        .then(blob => {
+            const url = window.URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.href = url;
+            a.download = 'articles.json';
+            document.body.appendChild(a);
+            a.click();
+            window.URL.revokeObjectURL(url);
+            document.body.removeChild(a);
+        })
+        .catch(error => {
+            alert('导出失败: ' + error);
+        });
     });
 
     // Export as CSV
@@ -446,25 +434,25 @@
                 format: 'csv'
             })
         })
-            .then(response => {
-                if (response.ok) {
-                    return response.blob();
-                }
-                throw new Error('导出失败');
-            })
-            .then(blob => {
-                const url = window.URL.createObjectURL(blob);
-                const a = document.createElement('a');
-                a.href = url;
-                a.download = 'articles.csv';
-                document.body.appendChild(a);
-                a.click();
-                window.URL.revokeObjectURL(url);
-                document.body.removeChild(a);
-            })
-            .catch(error => {
-                alert('导出失败: ' + error);
-            });
+        .then(response => {
+            if (response.ok) {
+                return response.blob();
+            }
+            throw new Error('导出失败');
+        })
+        .then(blob => {
+            const url = window.URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.href = url;
+            a.download = 'articles.csv';
+            document.body.appendChild(a);
+            a.click();
+            window.URL.revokeObjectURL(url);
+            document.body.removeChild(a);
+        })
+        .catch(error => {
+            alert('导出失败: ' + error);
+        });
     });
 
     // New: export as a ZIP archive
@@ -484,25 +472,25 @@
                 format: 'zip' // export format: ZIP
             })
         })
-            .then(response => {
-                if (response.ok) {
-                    return response.blob();
-                }
-                throw new Error('导出失败');
-            })
-            .then(blob => {
-                const url = window.URL.createObjectURL(blob);
-                const a = document.createElement('a');
-                a.href = url;
-                a.download = 'articles.zip';
-                document.body.appendChild(a);
-                a.click();
-                window.URL.revokeObjectURL(url);
-                document.body.removeChild(a);
-            })
-            .catch(error => {
-                alert('导出失败: ' + error);
-            });
+        .then(response => {
+            if (response.ok) {
+                return response.blob();
+            }
+            throw new Error('导出失败');
+        })
+        .then(blob => {
+            const url = window.URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.href = url;
+            a.download = 'articles.zip';
+            document.body.appendChild(a);
+            a.click();
+            window.URL.revokeObjectURL(url);
+            document.body.removeChild(a);
+        })
+        .catch(error => {
+            alert('导出失败: ' + error);
+        });
     });
 
     // Initialize the export button state
diff --git a/core/utils.py b/core/utils.py
index 89f43c3..4e1ea84 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -107,6 +107,17 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="mainBody")
         )
+    elif website.name == "人民日报":
+        # Article structure handling for the People's Daily site
+        title_tag = soup.find("h1") or soup.find("title")
+        # Locate the main content area
+        content_tag = (
+            soup.find("div", class_="content") or
+            soup.find("div", class_="article-content") or
+            soup.find("div", id="content") or
+            soup.find("div", class_="text") or
+            soup.find("section", class_="content")
+        )
     else:
         # Default handling
         title_tag = soup.find("h1") or soup.find("title")
@@ -256,6 +267,24 @@
                 ("/xinwen/" in path) or
                 ("/huoban/" in path)
             )
+        elif website.name == "人民日报":
+            # Article-page detection logic for People's Daily
+            parsed_url = urlparse(url)
+            path = parsed_url.path
+            # Change: more precise article-page detection
+            is_article_page = (
+                (soup.find("div", class_="content") is not None and
+                 soup.find("h1") is not None) or
+                soup.find("div", class_="article-content") is not None or
+                (soup.find("div", id="content") is not None and
+                 soup.find("h1") is not None) or
+                soup.find("div", class_="text") is not None or
+                soup.find("section", class_="content") is not None or
+                ("/article/" in path) or
+                (path.startswith("/detail/") and len(path) > 10) or
+                # Also match peopleapp.com-specific article paths
+                ("/dynamic/" in path and "article" in path)
+            )
         else:
             # Default detection logic
             is_article_page = (
@@ -271,5 +300,18 @@
         # Grow the queue with newly discovered links
         for link in soup.find_all("a", href=True):
             href = urljoin(url, link["href"])
-            if href not in visited and is_valid_url(href, base_netloc):
+            # For the People's Daily site, widen the link-discovery logic
+            if website.name == "人民日报":
+                # Allow crawling any link that starts with https://www.peopleapp.com/
+                if href.startswith("https://www.peopleapp.com/") and href not in visited:
+                    # Recognize article links
+                    parsed_href = urlparse(href)
+                    href_path = parsed_href.path
+                    # Add more likely article-link patterns
+                    if ("/article/" in href_path or
+                            href_path.startswith("/detail/") or
+                            ("/dynamic/" in href_path and "article" in href_path) or
+                            href_path.count("/") > 2):  # deeper pages are likely article pages
+                        queue.append(href)
+            elif href not in visited and is_valid_url(href, base_netloc):
                 queue.append(href)
diff --git a/core/views.py b/core/views.py
index 81a06f0..c78bbbf 100644
--- a/core/views.py
+++ b/core/views.py
@@ -22,6 +22,7 @@ def article_list(request):
 
     # Get the website used for filtering
     selected_website = None
+    # Change: always fetch all articles unless a specific filter is applied
     articles = Article.objects.all()
 
     website_id = request.GET.get('website')
@@ -411,4 +412,4 @@ def export_articles(request):
 
         return HttpResponse('不支持的格式', status=400)
     except Exception as e:
-        return HttpResponse(f'导出失败: {str(e)}', status=500)
+        return HttpResponse(f'导出失败: {str(e)}', status=500)
\ No newline at end of file
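
Note: the admin change above reads website.crawler_command and ArticleAdmin.get_websites() filters on enabled=True, but the corresponding Website model change is not included in this diff. Below is a minimal sketch of what those fields might look like; every field name, type, and default here is an assumption inferred from how core/admin.py and crawl_rmrb.py use the model, not part of this patch.

# core/models.py -- hypothetical sketch only, not part of this diff
from django.db import models


class Website(models.Model):
    name = models.CharField(max_length=100, unique=True)
    article_list_url = models.URLField()
    article_selector = models.CharField(max_length=200, default='a')
    base_url = models.URLField(blank=True, default='')
    # Assumed: name of the management command to run for this site (e.g. "crawl_rmrb");
    # run_crawler_view reports an error when this is empty.
    crawler_command = models.CharField(max_length=100, blank=True, default='')
    # Assumed: ArticleAdmin.get_websites() only lists enabled sites.
    enabled = models.BooleanField(default=True)

    def __str__(self):
        return self.name

Because run_crawler_view now invokes call_command(crawler_name, website_name) with a positional argument, any command configured in crawler_command would also need to accept that argument (for example via parser.add_argument('website_name', nargs='?') in add_arguments); the crawl_rmrb command added here does not define add_arguments, so passing the extra argument would likely raise a CommandError.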