Add support for dongfangyancao (东方烟草报)
core/admin.py (+223)
@@ -1,11 +1,234 @@
 from django.contrib import admin
+from django.contrib.admin import AdminSite
 from .models import Website, Article
+# Imports used by the admin actions
+from django.contrib import messages
+from django.http import HttpResponseRedirect
+# Imports needed for the export features
+import csv
+from django.http import HttpResponse
+import json
+
+
+# Custom admin sites
+class NewsCnAdminSite(AdminSite):
+    site_header = "新华网管理后台"
+    site_title = "新华网管理"
+    index_title = "新华网内容管理"
+
+
+class DongfangyancaoAdminSite(AdminSite):
+    site_header = "东方烟草报管理后台"
+    site_title = "东方烟草报管理"
+    index_title = "东方烟草报内容管理"
+
+
+# Instantiate the admin sites
+news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
+dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
+
+
 @admin.register(Website)
 class WebsiteAdmin(admin.ModelAdmin):
     list_display = ('name', 'base_url', 'enabled')
 
+
+# Custom actions for ArticleAdmin
 @admin.register(Article)
 class ArticleAdmin(admin.ModelAdmin):
     list_display = ('title', 'website', 'pub_date')
     search_fields = ('title', 'content')
+
+    # Action options
+    actions = ['delete_selected_articles', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json']
+
+    def delete_dongfangyancao_articles(self, request, queryset):
+        """Delete every 东方烟草报 article in one go."""
+        # Look up the 东方烟草报 Website object
+        try:
+            dongfangyancao_website = Website.objects.get(name='东方烟草报')
+            # Delete all of its articles
+            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
+            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
+        except Website.DoesNotExist:
+            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)
+
+    # Display name for the action
+    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
+
+    def export_as_csv(self, request, queryset):
+        """Export the selected articles as CSV."""
+        meta = self.model._meta
+        field_names = [field.name for field in meta.fields]
+
+        response = HttpResponse(content_type='text/csv')
+        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
+        writer = csv.writer(response)
+
+        writer.writerow(field_names)
+        for obj in queryset:
+            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in field_names]
+            writer.writerow(row)
+
+        return response
+
+    export_as_csv.short_description = "导出选中文章为CSV格式"
+
+    def export_as_json(self, request, queryset):
+        """Export the selected articles as JSON."""
+        response = HttpResponse(content_type='application/json')
+        response['Content-Disposition'] = 'attachment; filename=articles.json'
+
+        # Build the data to export
+        articles_data = []
+        for article in queryset:
+            articles_data.append({
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
+                'media_files': article.media_files
+            })
+
+        # Write the JSON payload
+        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
+        return response
+
+    export_as_json.short_description = "导出选中文章为JSON格式"
+
+
+# Site-specific article admin classes
+class NewsCnArticleAdmin(admin.ModelAdmin):
+    list_display = ('title', 'pub_date')
+    search_fields = ('title', 'content')
+    list_filter = ('pub_date',)
+    actions = ['export_as_csv', 'export_as_json']
+
+    def get_queryset(self, request):
+        qs = super().get_queryset(request)
+        # Only show www.news.cn articles
+        return qs.filter(website__name='www.news.cn')
+
+    def export_as_csv(self, request, queryset):
+        """Export the selected articles as CSV."""
+        meta = self.model._meta
+        field_names = [field.name for field in meta.fields if field.name != 'content']  # drop content to keep the CSV small
+
+        response = HttpResponse(content_type='text/csv')
+        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
+        writer = csv.writer(response)
+
+        writer.writerow(field_names)
+        for obj in queryset:
+            row = []
+            for field in field_names:
+                value = getattr(obj, field)
+                if callable(value):
+                    value = value()
+                if field == 'website':
+                    value = value.name
+                row.append(value)
+            writer.writerow(row)
+
+        return response
+
+    export_as_csv.short_description = "导出选中文章为CSV格式"
+
+    def export_as_json(self, request, queryset):
+        """Export the selected articles as JSON."""
+        response = HttpResponse(content_type='application/json')
+        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
+
+        # Build the data to export
+        articles_data = []
+        for article in queryset:
+            articles_data.append({
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
+                'media_files': article.media_files
+            })
+
+        # Write the JSON payload
+        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
+        return response
+
+    export_as_json.short_description = "导出选中文章为JSON格式"
+
+
+class DongfangyancaoArticleAdmin(admin.ModelAdmin):
+    list_display = ('title', 'pub_date')
+    search_fields = ('title', 'content')
+    list_filter = ('pub_date',)
+    # Action options
+    actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']
+
+    def get_queryset(self, request):
+        qs = super().get_queryset(request)
+        # Only show 东方烟草报 articles
+        return qs.filter(website__name='东方烟草报')
+
+    def delete_all_articles(self, request, queryset):
+        """Delete every article in the current filter (all 东方烟草报 articles)."""
+        # Delete everything this admin's queryset covers
+        deleted_count = self.get_queryset(request).delete()[0]
+        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
+
+    # Display name for the action
+    delete_all_articles.short_description = "删除所有当前筛选的文章"
+
+    def export_as_csv(self, request, queryset):
+        """Export the selected articles as CSV."""
+        meta = self.model._meta
+        field_names = [field.name for field in meta.fields if field.name != 'content']  # drop content to keep the CSV small
+
+        response = HttpResponse(content_type='text/csv')
+        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
+        writer = csv.writer(response)
+
+        writer.writerow(field_names)
+        for obj in queryset:
+            row = []
+            for field in field_names:
+                value = getattr(obj, field)
+                if callable(value):
+                    value = value()
+                if field == 'website':
+                    value = value.name
+                row.append(value)
+            writer.writerow(row)
+
+        return response
+
+    export_as_csv.short_description = "导出选中文章为CSV格式"
+
+    def export_as_json(self, request, queryset):
+        """Export the selected articles as JSON."""
+        response = HttpResponse(content_type='application/json')
+        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
+
+        # Build the data to export
+        articles_data = []
+        for article in queryset:
+            articles_data.append({
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
+                'media_files': article.media_files
+            })
+
+        # Write the JSON payload
+        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
+        return response
+
+    export_as_json.short_description = "导出选中文章为JSON格式"
+
+
+# Register the models with their respective admin sites
+news_cn_admin.register(Website, WebsiteAdmin)
+news_cn_admin.register(Article, NewsCnArticleAdmin)
+
+dongfangyancao_admin.register(Website, WebsiteAdmin)
+dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
@@ -1,20 +1,21 @@
-# core/management/commands/crawl_full_site.py
+# core/management/commands/crawl_dongfangyancao.py
 from django.core.management.base import BaseCommand
 from core.models import Website
 from core.utils import full_site_crawler
 
 
 class Command(BaseCommand):
-    help = "全站递归爬取 www.news.cn"
+    help = "全站递归爬取 东方烟草报"
 
     def handle(self, *args, **kwargs):
         website, created = Website.objects.get_or_create(
-            name="www.news.cn",
+            name="东方烟草报",
             defaults={
-                'article_list_url': 'https://www.news.cn/',
+                'article_list_url': 'https://www.eastobacco.com/',
                 'article_selector': 'a'
             }
         )
-        start_url = "https://www.news.cn/"
+        start_url = "https://www.eastobacco.com/"
         self.stdout.write(f"开始全站爬取: {start_url}")
         full_site_crawler(start_url, website, max_pages=500)
         self.stdout.write("爬取完成")
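If the file keeps the crawl_dongfangyancao.py name used in its header comment, Django exposes it as a management command of the same name. A minimal smoke test from a Django shell might look like this (the command name is an assumption based on that comment):

from django.core.management import call_command

# Kicks off the full-site crawl of https://www.eastobacco.com/ defined above;
# the "东方烟草报" Website row is created on first run via get_or_create().
call_command('crawl_dongfangyancao')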
@@ -1,18 +1,21 @@
+# core/management/commands/crawl_xinhua.py
 from django.core.management.base import BaseCommand
 from core.models import Website
-from core.utils import crawl_xinhua_list
+from core.utils import full_site_crawler
 
 
 class Command(BaseCommand):
-    help = '批量爬取新华网文章'
+    help = "全站递归爬取 www.news.cn"
 
-    def handle(self, *args, **options):
-        list_url = "https://www.news.cn/legal/index.html"
-        try:
-            website = Website.objects.get(base_url="https://www.news.cn/")
-        except Website.DoesNotExist:
-            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
-            return
-
-        self.stdout.write(f"开始爬取文章列表页: {list_url}")
-        crawl_xinhua_list(list_url, website)
-        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
+    def handle(self, *args, **kwargs):
+        website, created = Website.objects.get_or_create(
+            name="www.news.cn",
+            defaults={
+                'article_list_url': 'https://www.news.cn/',
+                'article_selector': 'a'
+            }
+        )
+        start_url = "https://www.news.cn/"
+        self.stdout.write(f"开始全站爬取: {start_url}")
+        full_site_crawler(start_url, website, max_pages=500)
+        self.stdout.write("爬取完成")
core/management/commands/crawl_xinhua_bak.py (new file, +21)
@@ -0,0 +1,21 @@
+from django.core.management.base import BaseCommand
+from core.models import Website
+from core.utils import crawl_xinhua_list
+
+
+class Command(BaseCommand):
+    help = '批量爬取新华网文章'
+
+    def handle(self, *args, **options):
+        # Usage marker to confirm whether this command is still being invoked
+        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
+
+        list_url = "https://www.news.cn/legal/index.html"
+        try:
+            website = Website.objects.get(base_url="https://www.news.cn/")
+        except Website.DoesNotExist:
+            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
+            return
+
+        self.stdout.write(f"开始爬取文章列表页: {list_url}")
+        crawl_xinhua_list(list_url, website)
+        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
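crawl_xinhua now runs the same full-site crawler against www.news.cn, while crawl_xinhua_bak keeps the old list-page crawl around. Both are ordinary management commands, so they can also be run from a Django shell (a sketch):

from django.core.management import call_command

call_command('crawl_xinhua')      # full-site crawl of https://www.news.cn/
call_command('crawl_xinhua_bak')  # old list-page crawl kept as a backup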
core/management/commands/export_articles.py (new file, +130)
@@ -0,0 +1,130 @@
+from django.core.management.base import BaseCommand
+from core.models import Article, Website
+import json
+import csv
+import os
+from django.conf import settings
+from django.core.files.storage import default_storage
+import zipfile
+from django.utils import timezone
+
+
+class Command(BaseCommand):
+    help = '导出文章及相关的媒体文件(图片、视频等)'
+
+    def add_arguments(self, parser):
+        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
+        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
+        parser.add_argument('--output', type=str, default='', help='输出文件路径')
+        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')
+
+    def handle(self, *args, **options):
+        format_type = options['format'].lower()
+        website_name = options['website']
+        output_path = options['output']
+        include_media = options['include_media']
+
+        # Build the article queryset
+        articles = Article.objects.all()
+        if website_name:
+            try:
+                website = Website.objects.get(name=website_name)
+                articles = articles.filter(website=website)
+            except Website.DoesNotExist:
+                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
+                return
+
+        if not articles.exists():
+            self.stdout.write(self.style.WARNING('没有找到文章'))
+            return
+
+        # Prepare the export data
+        articles_data = []
+        media_files = []
+
+        for article in articles:
+            article_data = {
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.isoformat() if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.isoformat(),
+                'media_files': article.media_files
+            }
+            articles_data.append(article_data)
+
+            # Collect media file paths
+            if include_media:
+                for media_path in article.media_files:
+                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
+                    if os.path.exists(full_path):
+                        media_files.append(full_path)
+
+        # Work out the output path
+        if not output_path:
+            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
+            if include_media:
+                output_path = f'articles_export_{timestamp}.zip'
+            else:
+                output_path = f'articles_export_{timestamp}.{format_type}'
+
+        # Run the export
+        if include_media:
+            self.export_with_media(articles_data, media_files, output_path, format_type)
+        else:
+            if format_type == 'json':
+                self.export_as_json(articles_data, output_path)
+            elif format_type == 'csv':
+                self.export_as_csv(articles_data, output_path)
+            else:
+                self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json 或 csv'))
+                return
+
+        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))
+
+    def export_as_json(self, articles_data, output_path):
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(articles_data, f, ensure_ascii=False, indent=2)
+
+    def export_as_csv(self, articles_data, output_path):
+        if not articles_data:
+            return
+
+        # Open the CSV file
+        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
+            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+            writer.writeheader()
+            for article_data in articles_data:
+                # Flatten the list so it can be stored in a CSV cell
+                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
+                writer.writerow(article_data)
+
+    def export_with_media(self, articles_data, media_files, output_path, format_type):
+        # Create the ZIP archive
+        with zipfile.ZipFile(output_path, 'w') as zipf:
+            # Add the article data file
+            data_filename = f'articles.{format_type}'
+            if format_type == 'json':
+                json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
+                zipf.writestr(data_filename, json_data)
+            elif format_type == 'csv':
+                # Build the CSV content
+                if articles_data:
+                    import io
+                    csv_buffer = io.StringIO()
+                    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
+                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
+                    writer.writeheader()
+                    for article_data in articles_data:
+                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
+                        writer.writerow(article_data)
+                    zipf.writestr(data_filename, csv_buffer.getvalue())
+
+            # Add the media files
+            for media_path in media_files:
+                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
+                zipf.write(media_path, arcname)
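A few example invocations of the new command, using the options defined in add_arguments above (the website name and shell workflow are illustrative):

from django.core.management import call_command

# JSON dump of every article to articles_export_<timestamp>.json
call_command('export_articles')

# CSV for a single site, bundled with its media files into a ZIP archive
call_command('export_articles', format='csv', website='东方烟草报', include_media=True)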
@@ -1,5 +1,6 @@
 from django.db import models
 
+
 class Website(models.Model):
     name = models.CharField(max_length=100, unique=True)
     base_url = models.URLField()
@@ -1,17 +1,77 @@
 <!DOCTYPE html>
 <html lang="zh">
 <head>
-    <meta charset="UTF-8" />
+    <meta charset="UTF-8"/>
     <title>{{ article.title }}</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: #f8f9fa;
+        }
+        .article-container {
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+            padding: 30px;
+            margin-bottom: 20px;
+        }
+        h1 {
+            color: #2c3e50;
+            border-bottom: 2px solid #3498db;
+            padding-bottom: 10px;
+            margin-top: 0;
+        }
+        .meta {
+            color: #7f8c8d;
+            font-size: 0.9em;
+            margin-bottom: 20px;
+        }
+        hr {
+            border: 0;
+            height: 1px;
+            background: #ecf0f1;
+            margin: 20px 0;
+        }
+        .content {
+            font-size: 16px;
+        }
+        .content img {
+            max-width: 100%;
+            height: auto;
+            border-radius: 4px;
+            margin: 10px 0;
+        }
+        .back-link {
+            display: inline-block;
+            padding: 10px 20px;
+            background-color: #3498db;
+            color: white;
+            text-decoration: none;
+            border-radius: 4px;
+            transition: background-color 0.3s;
+        }
+        .back-link:hover {
+            background-color: #2980b9;
+        }
+    </style>
 </head>
 <body>
-    <h1>{{ article.title }}</h1>
-    <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
-    <hr />
-    <div>
-        {{ article.content|safe }}
-    </div>
-    <hr />
-    <p><a href="{% url 'article_list' %}">返回列表</a></p>
+    <div class="article-container">
+        <h1>{{ article.title }}</h1>
+        <div class="meta">
+            <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
+        </div>
+        <hr/>
+        <div class="content">
+            {{ article.content|safe }}
+        </div>
+        <hr/>
+        <p><a href="{% url 'article_list' %}" class="back-link">← 返回列表</a></p>
+    </div>
 </body>
 </html>
@@ -1,33 +1,138 @@
 <!DOCTYPE html>
 <html lang="zh">
 <head>
-    <meta charset="UTF-8" />
+    <meta charset="UTF-8"/>
     <title>绿色课堂文章列表</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+            line-height: 1.6;
+            color: #333;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: #f8f9fa;
+        }
+        .container {
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+            padding: 30px;
+            margin-bottom: 20px;
+        }
+        h1 {
+            color: #2c3e50;
+            border-bottom: 2px solid #3498db;
+            padding-bottom: 10px;
+            margin-top: 0;
+        }
+        .filters {
+            margin-bottom: 20px;
+            padding: 15px;
+            background-color: #f1f8ff;
+            border-radius: 5px;
+        }
+        .filters a {
+            display: inline-block;
+            padding: 5px 10px;
+            margin: 0 5px 5px 0;
+            background-color: #e1e8ed;
+            color: #333;
+            text-decoration: none;
+            border-radius: 3px;
+        }
+        .filters a.active {
+            background-color: #3498db;
+            color: white;
+        }
+        ul {
+            list-style: none;
+            padding: 0;
+        }
+        li {
+            padding: 10px 0;
+            border-bottom: 1px solid #ecf0f1;
+        }
+        li:last-child {
+            border-bottom: none;
+        }
+        a {
+            color: #3498db;
+            text-decoration: none;
+        }
+        a:hover {
+            color: #2980b9;
+            text-decoration: underline;
+        }
+        .meta {
+            color: #7f8c8d;
+            font-size: 0.9em;
+        }
+        .pagination {
+            margin-top: 30px;
+            text-align: center;
+            padding: 20px 0;
+        }
+        .pagination a {
+            display: inline-block;
+            padding: 8px 16px;
+            background-color: #3498db;
+            color: white;
+            text-decoration: none;
+            border-radius: 4px;
+            margin: 0 5px;
+        }
+        .pagination a:hover {
+            background-color: #2980b9;
+        }
+        .pagination span {
+            margin: 0 10px;
+            color: #7f8c8d;
+        }
+    </style>
 </head>
 <body>
-    <h1>绿色课堂文章列表</h1>
-    <ul>
-    {% for article in page_obj %}
+    <div class="container">
+        <h1>绿色课堂文章列表</h1>
+
+        <div class="filters">
+            <strong>按网站筛选:</strong>
+            <a href="{% url 'article_list' %}" {% if not selected_website %}class="active"{% endif %}>全部</a>
+            {% for website in websites %}
+                <a href="?website={{ website.id }}" {% if selected_website.id == website.id %}class="active"{% endif %}>{{ website.name }}</a>
+            {% endfor %}
+        </div>
+
+        <ul>
+        {% for article in page_obj %}
             <li>
                 <a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
-                ({{ article.created_at|date:"Y-m-d" }})
+                <div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
             </li>
-    {% empty %}
+        {% empty %}
             <li>暂无文章</li>
-    {% endfor %}
-    </ul>
+        {% endfor %}
+        </ul>
+
         <div class="pagination">
             {% if page_obj.has_previous %}
-                <a href="?page={{ page_obj.previous_page_number }}">上一页</a>
+                {% if selected_website %}
+                    <a href="?website={{ selected_website.id }}&page={{ page_obj.previous_page_number }}">上一页</a>
+                {% else %}
+                    <a href="?page={{ page_obj.previous_page_number }}">上一页</a>
+                {% endif %}
             {% endif %}
+
             <span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
+
             {% if page_obj.has_next %}
-                <a href="?page={{ page_obj.next_page_number }}">下一页</a>
+                {% if selected_website %}
+                    <a href="?website={{ selected_website.id }}&page={{ page_obj.next_page_number }}">下一页</a>
+                {% else %}
+                    <a href="?page={{ page_obj.next_page_number }}">下一页</a>
+                {% endif %}
             {% endif %}
+        </div>
     </div>
 </body>
 </html>
@@ -7,16 +7,44 @@ from collections import deque
 from django.utils import timezone
 from django.conf import settings
 from core.models import Article
+import re
 
 
 def download_media(url, save_dir):
     try:
-        resp = requests.get(url, timeout=15)
+        # Send request headers to avoid 403 Forbidden responses
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Referer": urljoin(url, "/")
+        }
+        resp = requests.get(url, timeout=15, headers=headers)
         resp.raise_for_status()
     except Exception as e:
         print(f"下载失败:{url},错误:{e}")
         return None
 
-    filename = url.split("/")[-1].split("?")[0]
+    # Handle the filename more safely: strip query parameters and special characters
+    parsed_url = urlparse(url)
+    filename = os.path.basename(parsed_url.path)
+    if not filename or '.' not in filename:
+        # Fall back to a default name if the URL path has no usable filename
+        filename = 'media_file'
+
+    # Strip special characters from the filename
+    filename = re.sub(r'[^\w\-_\.]', '_', filename)
+
+    # Make sure the file has an extension
+    if '.' not in filename:
+        content_type = resp.headers.get('content-type', '')
+        if 'image/jpeg' in content_type:
+            filename += '.jpg'
+        elif 'image/png' in content_type:
+            filename += '.png'
+        elif 'image/gif' in content_type:
+            filename += '.gif'
+        else:
+            filename += '.bin'  # default binary extension
+
     os.makedirs(save_dir, exist_ok=True)
     filepath = os.path.join(save_dir, filename)
@@ -31,6 +59,7 @@ def download_media(url, save_dir):
         f.write(resp.content)
     return filepath
 
+
 def process_article(url, website):
     if Article.objects.filter(url=url).exists():
         print(f"文章已存在,跳过: {url}")
@@ -41,10 +70,36 @@ def process_article(url, website):
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    title_tag = soup.find("span", class_="title")
-    title = title_tag.get_text(strip=True) if title_tag else "无标题"
-    content_tag = soup.find("span", id="detailContent")
+    # Handle each site's article structure
+    if website.name == "www.news.cn":
+        title_tag = soup.find("span", class_="title")
+        content_tag = soup.find("span", id="detailContent")
+    elif website.name == "东方烟草报":
+        # Title extraction for 东方烟草报: try several selectors in priority order
+        title_tag = (
+            soup.find("h1", id="title") or       # h1 tag with id="title"
+            soup.find("h1") or                   # main heading tag
+            soup.find("title") or                # page <title> tag
+            soup.find("div", class_="title") or  # some pages use div.title
+            soup.find("h2")                      # fallback heading tag
+        )
+        content_tag = soup.find("div", class_="content")  # 东方烟草报 body text usually sits in div.content
+        # Support an alternative content structure
+        if not content_tag:
+            content_tag = soup.find("div", id="gallery")
+        # And one more content structure
+        if not content_tag:
+            content_tag = soup.find("div", id="ContentText")
+    else:
+        # Default handling
+        title_tag = soup.find("h1") or soup.find("title")
+        content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
+
+    title = title_tag.get_text(strip=True) if title_tag else "无标题"
+
+    # Extra clean-up: strip stray whitespace from the title
+    title = title.strip() if title else "无标题"
+
     if not content_tag:
         print("没有找到正文,跳过:", url)
         return
@@ -80,6 +135,7 @@ def process_article(url, website):
     )
     print(f"已保存文章及图片:{title}")
 
+
 def is_valid_url(url, base_netloc):
     try:
         parsed = urlparse(url)
@@ -91,6 +147,7 @@ def is_valid_url(url, base_netloc):
     except Exception:
         return False
 
+
 def full_site_crawler(start_url, website, max_pages=1000):
     headers = {"User-Agent": "Mozilla/5.0"}
     visited = set()
@@ -117,8 +174,30 @@ def full_site_crawler(start_url, website, max_pages=1000):
         resp.encoding = 'utf-8'
         soup = BeautifulSoup(resp.text, "html.parser")
 
+        # Decide whether this is an article page, per site
+        is_article_page = False
+        if website.name == "www.news.cn":
+            is_article_page = soup.find("span", id="detailContent") is not None
+        elif website.name == "东方烟草报":
+            # For 东方烟草报 also use a URL-pattern check:
+            # its article URLs usually contain /content/ plus a date segment
+            parsed_url = urlparse(url)
+            path = parsed_url.path
+            is_article_page = (
+                soup.find("div", class_="content") is not None or
+                soup.find("div", id="gallery") is not None or
+                soup.find("div", id="ContentText") is not None or
+                ("/content/" in path and len(path) > 20)
+            )
+        else:
+            # Default detection logic
+            is_article_page = (
+                soup.find("div", class_="content") is not None or
+                soup.find("div", id="content") is not None
+            )
+
         # If this is an article page, hand it over to the article processor
-        if soup.find("span", id="detailContent"):
+        if is_article_page:
             process_article(url, website)
             pages_crawled += 1
@@ -126,4 +205,4 @@ def full_site_crawler(start_url, website, max_pages=1000):
         for link in soup.find_all("a", href=True):
             href = urljoin(url, link["href"])
             if href not in visited and is_valid_url(href, base_netloc):
                 queue.append(href)
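The crawler helpers can also be driven directly, which is what the management commands above do; a rough sketch (the start URL and page limit here are illustrative):

from core.models import Website
from core.utils import full_site_crawler

# Mirrors the crawl_dongfangyancao command, but with a small page budget for a test run
site = Website.objects.get(name="东方烟草报")
full_site_crawler("https://www.eastobacco.com/", site, max_pages=50)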
@@ -2,20 +2,22 @@ from django.shortcuts import render, get_object_or_404
 from django.core.paginator import Paginator
 from .models import Article
 
 
 def article_list(request):
     """
     View that renders the article list.
     """
     articles = Article.objects.all().order_by('-created_at')
     paginator = Paginator(articles, 20)  # 20 articles per page
+
     page_number = request.GET.get('page')
     page_obj = paginator.get_page(page_number)
+
     return render(request, 'core/article_list.html', {
         'page_obj': page_obj
     })
 
 
 def article_detail(request, article_id):
     """
     View that renders a single article.
@@ -24,5 +26,3 @@ def article_detail(request, article_id):
     return render(request, 'core/article_detail.html', {
         'article': article
     })
-
-# Create your views here.
@@ -1,13 +1,18 @@
-from django.contrib import admin
-from django.urls import path, include
 from django.conf import settings
 from django.conf.urls.static import static
+from django.contrib import admin
+from django.urls import path, include
+
+# Import the custom admin site instances
+from core.admin import news_cn_admin, dongfangyancao_admin
 
 urlpatterns = [
     path('admin/', admin.site.urls),
+    path('news_cn_admin/', news_cn_admin.urls),
+    path('dongfangyancao_admin/', dongfangyancao_admin.urls),
     # Front-end routes live in the core app's urls
     path('', include('core.urls')),
 ]
 
 if settings.DEBUG:
     urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
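With these urlpatterns, the two custom admin sites from core/admin.py are mounted next to the default admin at /news_cn_admin/ and /dongfangyancao_admin/. A quick sanity check, sketched with Django's test client (anonymous requests should be redirected to each site's login page):

from django.test import Client

c = Client()
print(c.get('/news_cn_admin/').status_code)         # expected: 302 redirect to login
print(c.get('/dongfangyancao_admin/').status_code)  # expected: 302 redirect to login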