fix bugs and support all platforms

2025-08-15 08:33:47 +08:00
parent e82b85f4dd
commit 4945b4c6b0
36 changed files with 2296 additions and 992 deletions

.gitignore (vendored): 6 lines changed

@@ -180,5 +180,11 @@ cython_debug/
#
#####################################
# 数据目录
data/
data/media/
# 配置文件
config/
.env


@@ -1,517 +0,0 @@
from .models import Website, Article
# 添加actions相关的导入
from django.contrib import messages
# 添加导出功能所需导入
import csv
from django.http import HttpResponse
import json
# 添加视图函数需要的导入
from django.shortcuts import render, redirect
from django.urls import path
from django.contrib import admin
from django.core.management import call_command
# 添加运行爬虫的视图函数
def run_crawler_view(request):
"""
管理后台运行爬虫的视图
"""
if request.method == 'POST':
website_name = request.POST.get('website_name')
if not website_name:
messages.error(request, '请选择要爬取的网站')
return redirect('admin:core_article_changelist')
try:
# 动态获取网站对象
website = Website.objects.get(name=website_name)
# 根据网站对象确定要执行的爬虫命令
# 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令
crawler_name = getattr(website, 'crawler_command', None)
# 如果网站没有配置爬虫命令,则报错
if not crawler_name:
messages.error(request, f'网站 {website_name} 未配置爬虫命令')
return redirect('admin:core_article_changelist')
# 运行爬虫命令,传递网站名称
call_command(crawler_name, website_name)
messages.success(request, f'成功执行爬虫: {website_name}')
except Website.DoesNotExist:
messages.error(request, f'网站不存在: {website_name}')
except Exception as e:
messages.error(request, f'执行爬虫失败: {str(e)}')
return redirect('admin:core_article_changelist')
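# A minimal setup sketch for the view above, assuming the Website model exposes a
# crawler_command field and that a management command named 'crawl_news_cn' exists
# (both names are illustrative, not confirmed by this commit):
from core.models import Website
site, _ = Website.objects.get_or_create(
    name='www.news.cn',
    defaults={'base_url': 'https://www.news.cn', 'enabled': True},
)
site.crawler_command = 'crawl_news_cn'  # hypothetical command under core/management/commands/
site.save()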
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
list_display = ('name', 'base_url', 'enabled')
# 为ArticleAdmin添加自定义动作
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'website', 'pub_date')
search_fields = ('title', 'content')
# 添加动作选项
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
'export_as_word', 'export_with_media']
def get_websites(self):
"""获取所有启用的网站"""
return Website.objects.filter(enabled=True)
# 重写get_urls方法添加自定义URL
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'),
]
return custom_urls + urls
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields]
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in
field_names]
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=articles.json'
# 构造要导出的数据
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 写入JSON数据
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
def export_as_word(self, request, queryset):
"""导出选中的文章为Word格式"""
try:
from docx import Document
from io import BytesIO
from docx.shared import Inches
except ImportError:
self.message_user(request, "缺少python-docx库请安装: pip install python-docx", messages.ERROR)
return
# 创建Word文档
doc = Document()
doc.add_heading('文章导出', 0)
for article in queryset:
# 添加文章标题
doc.add_heading(article.title, level=1)
# 添加文章元数据
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容移除标签并处理图片
from bs4 import BeautifulSoup
soup = BeautifulSoup(article.content, 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
from io import BytesIO
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息
if article.media_files:
doc.add_heading('媒体文件', level=2)
for media_file in article.media_files:
try:
import os
from django.conf import settings
from io import BytesIO
import requests
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 添加分页符
doc.add_page_break()
# 保存到内存
buffer = BytesIO()
doc.save(buffer)
buffer.seek(0)
# 创建HttpResponse
from django.http import HttpResponse
response = HttpResponse(buffer.getvalue(),
content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
response['Content-Disposition'] = 'attachment; filename=articles.docx'
return response
export_as_word.short_description = "导出选中文章为Word格式"
def export_with_media(self, request, queryset):
"""导出选中的文章及媒体文件为ZIP包"""
try:
from docx import Document
from io import BytesIO
from docx.shared import Inches
import zipfile
except ImportError:
self.message_user(request, "缺少必要库,请安装: pip install python-docx", messages.ERROR)
return
# 创建内存中的ZIP文件
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
for article in queryset:
# 为每篇文章创建单独的文件夹(清理标题中的非法文件名字符)
safe_title = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in article.title)
article_folder = f"article_{article.id}_{safe_title}"
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章元数据
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容移除标签并处理图片
from bs4 import BeautifulSoup
soup = BeautifulSoup(article.content, 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
# 将网络文件保存到ZIP
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)),
response.content)
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', src.lstrip('/')))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息并打包媒体文件
if article.media_files:
doc.add_heading('媒体文件', level=2)
for media_file in article.media_files:
try:
import os
import requests
from django.conf import settings
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
# 检查文件扩展名以确定处理方式
file_extension = os.path.splitext(media_file)[1].lower()
# 图片文件处理
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
# 将网络文件保存到ZIP
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
else:
doc.add_paragraph(media_file)
# 视频文件处理
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
# 视频文件只添加到ZIP包中不在Word文档中显示
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
# 在Word文档中添加视频文件信息
doc.add_paragraph(f"[视频文件: {media_file}]")
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
# 将网络文件保存到ZIP
response = requests.get(media_file, timeout=10)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
doc.add_paragraph(f"[视频文件: {media_file}]")
else:
doc.add_paragraph(media_file)
# 其他文件类型
else:
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
doc.add_paragraph(f"[文件: {media_file}]")
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
doc.add_paragraph(f"[文件: {media_file}]")
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 保存每篇文章的Word文档到ZIP文件中的对应文件夹
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
doc_buffer.read())
# 创建HttpResponse
zip_buffer.seek(0)
from django.http import HttpResponse
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
return response
export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)"
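# A quick inspection sketch for the archive produced above: every article gets its own
# folder containing a .docx plus a media/ subfolder. Assumes the exported file has been
# saved locally as articles_export.zip (the filename set in Content-Disposition above):
import zipfile
with zipfile.ZipFile('articles_export.zip') as zf:
    for name in zf.namelist():
        print(name)  # e.g. article_<id>_<title>/<title>.docx, article_<id>_<title>/media/...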
# 为不同网站创建专门的文章管理类
class NewsCnArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'pub_date')
search_fields = ('title', 'content')
list_filter = ('pub_date',)
actions = ['export_as_csv', 'export_as_json']
def get_queryset(self, request):
qs = super().get_queryset(request)
# 只显示新华网的文章
return qs.filter(website__name='www.news.cn')
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = []
for field in field_names:
value = getattr(obj, field)
if callable(value):
value = value()
if field == 'website':
value = value.name
row.append(value)
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
# 构造要导出的数据
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 写入JSON数据
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'pub_date')
search_fields = ('title', 'content')
list_filter = ('pub_date',)
# 添加动作选项
actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']
def get_queryset(self, request):
qs = super().get_queryset(request)
# 只显示东方烟草报的文章
return qs.filter(website__name='东方烟草报')
def delete_all_articles(self, request, queryset):
"""删除当前筛选的所有文章(东方烟草报的所有文章)"""
# 删除所有东方烟草报的文章
deleted_count = self.get_queryset(request).delete()[0]
self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
# 设置动作的显示名称
delete_all_articles.short_description = "删除所有当前筛选的文章"
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = []
for field in field_names:
value = getattr(obj, field)
if callable(value):
value = value()
if field == 'website':
value = value.name
row.append(value)
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
# 构造要导出的数据
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 写入JSON数据
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
# 在各自的管理站点中注册模型

core/admin_extended.py (new file): 384 lines added

@@ -0,0 +1,384 @@
"""
Django Admin扩展
提供增强的管理界面功能
"""
import logging
from datetime import datetime, timedelta
from django.contrib import admin
from django.contrib.admin import SimpleListFilter
from django.contrib.admin.utils import model_format_dict
from django.contrib import messages
from django.http import HttpResponseRedirect
from django.urls import path, reverse
from django.utils.html import format_html
from django.utils import timezone
from django.db.models import Count, Q
from django.core.cache import cache
from .models import Website, Article
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
from .distributed_crawler import distributed_crawler
logger = logging.getLogger(__name__)
class WebsiteStatusFilter(SimpleListFilter):
"""网站状态过滤器"""
title = '网站状态'
parameter_name = 'status'
def lookups(self, request, model_admin):
return (
('enabled', '已启用'),
('disabled', '已禁用'),
('no_articles', '无文章'),
('recent_crawl', '最近爬取'),
)
def queryset(self, request, queryset):
if self.value() == 'enabled':
return queryset.filter(enabled=True)
elif self.value() == 'disabled':
return queryset.filter(enabled=False)
elif self.value() == 'no_articles':
return queryset.annotate(article_count=Count('article')).filter(article_count=0)
elif self.value() == 'recent_crawl':
week_ago = timezone.now() - timedelta(days=7)
return queryset.filter(last_crawl__gte=week_ago)
return queryset
class ArticleDateFilter(SimpleListFilter):
"""文章日期过滤器"""
title = '发布时间'
parameter_name = 'date_range'
def lookups(self, request, model_admin):
return (
('today', '今天'),
('week', '本周'),
('month', '本月'),
('quarter', '本季度'),
)
def queryset(self, request, queryset):
now = timezone.now()
if self.value() == 'today':
return queryset.filter(created_at__date=now.date())
elif self.value() == 'week':
week_start = now - timedelta(days=now.weekday())
return queryset.filter(created_at__gte=week_start.replace(hour=0, minute=0, second=0))
elif self.value() == 'month':
return queryset.filter(created_at__year=now.year, created_at__month=now.month)
elif self.value() == 'quarter':
quarter = (now.month - 1) // 3
quarter_start_month = quarter * 3 + 1
return queryset.filter(
created_at__year=now.year,
created_at__month__gte=quarter_start_month,
created_at__month__lt=quarter_start_month + 3
)
return queryset
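# A worked example of the quarter arithmetic above (illustration only):
month = 8                              # August
quarter = (month - 1) // 3             # -> 2, i.e. the third quarter (zero-based)
quarter_start_month = quarter * 3 + 1  # -> 7, so the filter keeps July..September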
class WebsiteAdmin(admin.ModelAdmin):
"""网站管理"""
list_display = [
'name', 'base_url', 'enabled', 'article_count',
'last_crawl_display', 'status_indicator', 'actions_column'
]
list_filter = [WebsiteStatusFilter, 'enabled']
search_fields = ['name', 'base_url']
readonly_fields = ['article_count']
actions = ['enable_websites', 'disable_websites', 'crawl_selected', 'crawl_all']
fieldsets = (
('基本信息', {
'fields': ('name', 'base_url', 'enabled')
}),
('统计信息', {
'fields': ('article_count',),
'classes': ('collapse',)
}),
('时间信息', {
'fields': (),
'classes': ('collapse',)
}),
)
# 添加get_websites方法以支持模板中的网站选择
def get_websites(self, request):
"""获取所有启用的网站,用于模板中的选择框"""
return Website.objects.filter(enabled=True)
def article_count(self, obj):
"""文章数量"""
return obj.article_set.count()
article_count.short_description = '文章数量'
def last_crawl_display(self, obj):
"""最后爬取时间显示"""
return '未实现'
last_crawl_display.short_description = '最后爬取'
def status_indicator(self, obj):
"""状态指示器"""
if obj.enabled:
return format_html('<span style="color: green;">●</span> 正常')
else:
return format_html('<span style="color: red;">●</span> 禁用')
status_indicator.short_description = '状态'
def actions_column(self, obj):
"""操作列"""
return format_html(
'<a href="{}" class="button">爬取</a> '
'<a href="{}" class="button">查看文章</a>',
reverse('admin:crawl_website', args=[obj.id]),
reverse('admin:core_article_changelist') + f'?website__id__exact={obj.id}'
)
actions_column.short_description = '操作'
def enable_websites(self, request, queryset):
"""启用选中的网站"""
updated = queryset.update(enabled=True)
self.message_user(request, f'成功启用 {updated} 个网站')
enable_websites.short_description = '启用选中的网站'
def disable_websites(self, request, queryset):
"""禁用选中的网站"""
updated = queryset.update(enabled=False)
self.message_user(request, f'成功禁用 {updated} 个网站')
disable_websites.short_description = '禁用选中的网站'
def crawl_selected(self, request, queryset):
"""爬取选中的网站"""
for website in queryset:
try:
task = crawl_website.delay(website.id)
self.message_user(
request,
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'网站 {website.name} 爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
crawl_selected.short_description = '爬取选中的网站'
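# A minimal broker check sketch for the "[Errno 61] Connection refused" branch above,
# assuming the redis-py package is installed and the broker runs at the default
# redis://localhost:6379/0 (adjust to settings.CELERY_BROKER_URL):
import redis
broker = redis.Redis.from_url('redis://localhost:6379/0')
print(broker.ping())  # True once redis-server (or the Docker container) is reachable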
def crawl_all(self, request, queryset):
try:
task = crawl_all_websites.delay()
self.message_user(
request,
f'批量爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'批量爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
# crawl_all.short_description = '爬取所有网站'
def get_urls(self):
"""添加自定义URL"""
urls = super().get_urls()
custom_urls = [
path(
'<int:website_id>/crawl/',
self.admin_site.admin_view(self.crawl_website_view),
name='crawl_website',
),
path(
'run-crawler/',
self.admin_site.admin_view(self.run_crawler_view),
name='run_crawler',
),
]
return custom_urls + urls
def crawl_website_view(self, request, website_id):
"""爬取单个网站视图"""
try:
website = Website.objects.get(id=website_id)
task = crawl_website.delay(website_id)
self.message_user(
request,
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Website.DoesNotExist:
self.message_user(request, '网站不存在', messages.ERROR)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(request, f'爬取任务启动失败: {detailed_msg}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
def run_crawler_view(self, request):
"""运行爬虫视图"""
try:
task = crawl_all_websites.delay()
self.message_user(
request,
f'批量爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'批量爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
class ArticleAdmin(admin.ModelAdmin):
"""文章管理"""
list_display = [
'title', 'website', 'created_at',
'media_count', 'actions_column'
]
list_filter = [
ArticleDateFilter, 'website', 'created_at'
]
search_fields = ['title', 'content', 'url']
readonly_fields = ['created_at', 'media_files_display']
date_hierarchy = 'created_at'
fieldsets = (
('基本信息', {
'fields': ('title', 'url', 'website')
}),
('内容', {
'fields': ('content',)
}),
('媒体文件', {
'fields': ('media_files_display',),
'classes': ('collapse',)
}),
('时间信息', {
'fields': ('created_at',),
'classes': ('collapse',)
}),
)
def content_preview(self, obj):
"""内容预览"""
return obj.content[:100] + '...' if len(obj.content) > 100 else obj.content
content_preview.short_description = '内容预览'
def media_count(self, obj):
"""媒体文件数量"""
if obj.media_files:
return len(obj.media_files)
return 0
media_count.short_description = '媒体文件'
def media_files_display(self, obj):
"""媒体文件显示"""
if not obj.media_files:
return '无媒体文件'
html = '<div style="max-height: 300px; overflow-y: auto;">'
for i, media in enumerate(obj.media_files):
if media.get('type') == 'image':
html += f'<div style="margin: 10px 0;"><img src="{media["url"]}" style="max-width: 200px; max-height: 150px;" /></div>'
elif media.get('type') == 'video':
html += f'<div style="margin: 10px 0;"><video controls style="max-width: 200px;"><source src="{media["url"]}" type="video/mp4"></video></div>'
html += '</div>'
return format_html(html)
media_files_display.short_description = '媒体文件'
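# A sketch of the entry shape media_files_display expects: a dict with 'type' and 'url'
# keys. Elsewhere in this commit media_files is handled as a list of path strings, so
# the structure below is an assumption for illustration only:
example_media_files = [
    {'type': 'image', 'url': '/media/images/example.jpg'},
    {'type': 'video', 'url': '/media/videos/example.mp4'},
]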
def actions_column(self, obj):
"""操作列"""
# 修改: 添加跳转到本地文章详情页的链接
return format_html(
'<a href="{}" target="_blank" class="button">查看原文</a> '
'<a href="{}" target="_blank" class="button">本地查看</a>',
obj.url,
reverse('article_detail', args=[obj.id])
)
actions_column.short_description = '操作'
class CrawlerStatusAdmin(admin.ModelAdmin):
"""爬虫状态管理"""
change_list_template = 'admin/crawler_status.html'
def changelist_view(self, request, extra_context=None):
"""爬虫状态视图"""
# 获取分布式爬虫状态
nodes = distributed_crawler.get_available_nodes()
node_statuses = []
for node_id in nodes:
status = distributed_crawler.get_node_status(node_id)
node_statuses.append(status)
# 获取最近的批次
batches = distributed_crawler.get_all_batches()[:10]
# 获取任务统计
task_stats = {
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
'total_nodes': len(nodes),
'total_batches': len(batches),
}
extra_context = extra_context or {}
extra_context.update({
'nodes': node_statuses,
'batches': batches,
'task_stats': task_stats,
})
return super().changelist_view(request, extra_context)
# 注册管理类
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)
# 自定义管理站点标题
admin.site.site_header = 'Green Classroom 管理系统'
admin.site.site_title = 'Green Classroom'
admin.site.index_title = '欢迎使用 Green Classroom 管理系统'

core/api.py (new file): 746 lines added

@@ -0,0 +1,746 @@
"""
RESTful API模块
提供完整的API接口支持爬虫管理、数据查询、任务控制
"""
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Any
import json
import csv
import io
import zipfile
from django.http import JsonResponse, HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods
from django.core.paginator import Paginator
from django.db.models import Q, Count
from django.utils import timezone
# 添加DRF相关导入
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.permissions import IsAuthenticated
from rest_framework.authentication import SessionAuthentication, TokenAuthentication
# 添加python-docx库支持
from docx import Document
# 添加BeautifulSoup导入
from bs4 import BeautifulSoup
from .models import Website, Article
from .tasks import crawl_website, cleanup_old_articles
from .distributed_crawler import distributed_crawler
logger = logging.getLogger(__name__)
def api_response(data=None, message="", status=200, error=None):
"""统一的API响应格式"""
response = {
"success": status < 400,
"message": message,
"timestamp": datetime.now().isoformat(),
}
if data is not None:
response["data"] = data
if error:
response["error"] = error
# 如果是DRF视图则返回DRF Response
if hasattr(api_response, '_use_drf_response') and api_response._use_drf_response:
return Response(response, status=status)
return JsonResponse(response, status=status)
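# A sketch of the envelope api_response produces on success (values are illustrative):
example_envelope = {
    "success": True,
    "message": "获取网站列表成功",
    "timestamp": "2025-08-15T08:33:47",
    "data": {"websites": [], "pagination": {}, "stats": {}},
}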
# 修改健康检查接口为DRF类视图
class HealthView(APIView):
"""健康检查接口"""
permission_classes = [] # 允许无认证访问
authentication_classes = []
def get(self, request):
try:
# 检查数据库连接
website_count = Website.objects.count()
article_count = Article.objects.count()
# 检查Redis连接
from django.core.cache import cache
cache.set('health_check', 'ok', 60)
cache_result = cache.get('health_check')
health_data = {
"status": "healthy",
"database": "ok",
"redis": "ok" if cache_result == 'ok' else 'error',
"website_count": website_count,
"article_count": article_count,
"uptime": "running"
}
# 设置使用DRF响应
api_response._use_drf_response = True
return api_response(data=health_data, message="服务运行正常")
except Exception as e:
logger.error(f"健康检查失败: {e}")
return api_response(
data={"status": "unhealthy", "error": str(e)},
message="服务异常",
status=500,
error=str(e)
)
finally:
api_response._use_drf_response = False
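# A usage sketch for the health check, assuming the project routes HealthView at
# /api/health/ (the path is an assumption; adjust to the project's urls.py):
import requests
resp = requests.get('http://localhost:8000/api/health/', timeout=10)
print(resp.json()['data']['status'])  # "healthy" when database and Redis respond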
# 修改网站列表接口为DRF类视图
class WebsitesView(APIView):
"""获取网站列表"""
permission_classes = [IsAuthenticated]
authentication_classes = [SessionAuthentication, TokenAuthentication]
def get(self, request):
try:
# 分页参数
page = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 20))
search = request.GET.get('search', '')
enabled = request.GET.get('enabled', '')
# 构建查询
queryset = Website.objects.all()
if search:
queryset = queryset.filter(
Q(name__icontains=search) |
Q(base_url__icontains=search)
)
if enabled in ['true', 'false']:
queryset = queryset.filter(enabled=enabled == 'true')
# 排序 - 使用id字段替代不存在的created_at字段
queryset = queryset.order_by('-id')
# 分页
paginator = Paginator(queryset, page_size)
websites_page = paginator.get_page(page)
# 统计数据
stats = {
'total_websites': Website.objects.count(),
'enabled_websites': Website.objects.filter(enabled=True).count(),
'disabled_websites': Website.objects.filter(enabled=False).count(),
}
# 序列化数据
websites_data = []
for website in websites_page:
website_data = {
'id': website.id,
'name': website.name,
'base_url': website.base_url,
'enabled': website.enabled,
# 移除不存在的created_at和updated_at字段
'article_count': website.article_set.count(),
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
}
websites_data.append(website_data)
response_data = {
'websites': websites_data,
'pagination': {
'page': page,
'page_size': page_size,
'total_pages': paginator.num_pages,
'total_count': paginator.count,
'has_next': websites_page.has_next(),
'has_previous': websites_page.has_previous(),
},
'stats': stats
}
# 设置使用DRF响应
api_response._use_drf_response = True
return api_response(data=response_data, message="获取网站列表成功")
except Exception as e:
logger.error(f"获取网站列表失败: {e}")
return api_response(message="获取网站列表失败", status=500, error=str(e))
finally:
api_response._use_drf_response = False
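# A usage sketch for the website list, assuming a /api/websites/ route and token
# authentication (both the path and the token value are assumptions):
import requests
resp = requests.get(
    'http://localhost:8000/api/websites/',
    params={'page': 1, 'page_size': 20, 'search': 'news', 'enabled': 'true'},
    headers={'Authorization': 'Token <your-token>'},
    timeout=10,
)
print(resp.json()['data']['pagination'])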
@csrf_exempt
@require_http_methods(["GET"])
def api_website_detail(request, website_id):
"""获取网站详情"""
try:
website = Website.objects.get(id=website_id)
# 获取最近的文章
recent_articles = website.article_set.order_by('-created_at')[:10]
website_data = {
'id': website.id,
'name': website.name,
'base_url': website.base_url,
'enabled': website.enabled,
# Website currently has no created_at/updated_at/last_crawl fields (see the note in WebsitesView), so access them defensively
'created_at': website.created_at.isoformat() if getattr(website, 'created_at', None) else None,
'updated_at': website.updated_at.isoformat() if getattr(website, 'updated_at', None) else None,
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
'article_count': website.article_set.count(),
'recent_articles': [
{
'id': article.id,
'title': article.title,
'url': article.url,
'created_at': article.created_at.isoformat(),
}
for article in recent_articles
]
}
return api_response(data=website_data, message="获取网站详情成功")
except Website.DoesNotExist:
return api_response(message="网站不存在", status=404, error="Website not found")
except Exception as e:
logger.error(f"获取网站详情失败: {e}")
return api_response(message="获取网站详情失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def api_crawl_website(request, website_id):
"""爬取指定网站"""
try:
website = Website.objects.get(id=website_id)
# 启动爬虫任务
task = crawl_website.delay(website_id)
response_data = {
'task_id': task.id,
'website_id': website_id,
'website_name': website.name,
'status': 'started'
}
return api_response(data=response_data, message="爬虫任务已启动")
except Website.DoesNotExist:
return api_response(message="网站不存在", status=404, error="Website not found")
except Exception as e:
logger.error(f"启动爬虫任务失败: {e}")
return api_response(message="启动爬虫任务失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_articles(request):
"""获取文章列表"""
try:
# 分页参数
page = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 20))
search = request.GET.get('search', '')
website_id = request.GET.get('website_id', '')
date_from = request.GET.get('date_from', '')
date_to = request.GET.get('date_to', '')
# 构建查询
queryset = Article.objects.select_related('website').all()
if search:
queryset = queryset.filter(
Q(title__icontains=search) |
Q(content__icontains=search)
)
if website_id:
queryset = queryset.filter(website_id=website_id)
if date_from:
try:
date_from_obj = datetime.fromisoformat(date_from.replace('Z', '+00:00'))
queryset = queryset.filter(created_at__gte=date_from_obj)
except ValueError:
pass
if date_to:
try:
date_to_obj = datetime.fromisoformat(date_to.replace('Z', '+00:00'))
queryset = queryset.filter(created_at__lte=date_to_obj)
except ValueError:
pass
# 排序
queryset = queryset.order_by('-created_at')
# 分页
paginator = Paginator(queryset, page_size)
articles_page = paginator.get_page(page)
# 统计数据
stats = {
'total_articles': Article.objects.count(),
'today_articles': Article.objects.filter(
created_at__date=timezone.now().date()
).count(),
'week_articles': Article.objects.filter(
created_at__gte=timezone.now() - timedelta(days=7)
).count(),
}
# 序列化数据
articles_data = []
for article in articles_page:
article_data = {
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content[:200] + '...' if len(article.content) > 200 else article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
},
'media_files': article.media_files,
}
articles_data.append(article_data)
response_data = {
'articles': articles_data,
'pagination': {
'page': page,
'page_size': page_size,
'total_pages': paginator.num_pages,
'total_count': paginator.count,
'has_next': articles_page.has_next(),
'has_previous': articles_page.has_previous(),
},
'stats': stats
}
return api_response(data=response_data, message="获取文章列表成功")
except Exception as e:
logger.error(f"获取文章列表失败: {e}")
return api_response(message="获取文章列表失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_article_detail(request, article_id):
"""获取文章详情"""
try:
article = Article.objects.select_related('website').get(id=article_id)
article_data = {
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
'base_url': article.website.base_url,
},
'media_files': article.media_files,
}
return api_response(data=article_data, message="获取文章详情成功")
except Article.DoesNotExist:
return api_response(message="文章不存在", status=404, error="Article not found")
except Exception as e:
logger.error(f"获取文章详情失败: {e}")
return api_response(message="获取文章详情失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_crawler_status(request):
"""获取爬虫状态"""
try:
# 获取分布式爬虫状态
nodes = distributed_crawler.get_available_nodes()
node_statuses = []
for node_id in nodes:
status = distributed_crawler.get_node_status(node_id)
node_statuses.append(status)
# 获取最近的批次
batches = distributed_crawler.get_all_batches()[:10]
# 获取任务统计
task_stats = {
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
'total_nodes': len(nodes),
'total_batches': len(batches),
}
response_data = {
'nodes': node_statuses,
'batches': batches,
'stats': task_stats,
}
return api_response(data=response_data, message="获取爬虫状态成功")
except Exception as e:
logger.error(f"获取爬虫状态失败: {e}")
return api_response(message="获取爬虫状态失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def api_start_distributed_crawl(request):
"""启动分布式爬取"""
try:
data = json.loads(request.body)
website_ids = data.get('website_ids', [])
if not website_ids:
return api_response(message="请选择要爬取的网站", status=400, error="No websites selected")
# 启动分布式爬取
batch_id = distributed_crawler.distribute_crawl_tasks(website_ids)
if batch_id in ['no_websites', 'no_available_nodes']:
return api_response(message="无法启动分布式爬取", status=400, error=batch_id)
response_data = {
'batch_id': batch_id,
'website_ids': website_ids,
'status': 'started'
}
return api_response(data=response_data, message="分布式爬取已启动")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"启动分布式爬取失败: {e}")
return api_response(message="启动分布式爬取失败", status=500, error=str(e))
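# A request sketch for starting a distributed crawl; the /api/crawler/distributed/
# path is an assumption, while the body mirrors the fields read above:
import requests
resp = requests.post(
    'http://localhost:8000/api/crawler/distributed/',
    json={'website_ids': [1, 2, 3]},
    timeout=10,
)
print(resp.json().get('data', {}).get('batch_id'))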
@csrf_exempt
@require_http_methods(["GET"])
def api_batch_status(request, batch_id):
"""获取批次状态"""
try:
batch_status = distributed_crawler.get_batch_status(batch_id)
if batch_status.get('status') == 'not_found':
return api_response(message="批次不存在", status=404, error="Batch not found")
return api_response(data=batch_status, message="获取批次状态成功")
except Exception as e:
logger.error(f"获取批次状态失败: {e}")
return api_response(message="获取批次状态失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET", "POST"])
def api_cleanup_articles(request):
"""清理旧文章"""
# 如果是GET请求返回清理功能的描述信息
if request.method == "GET":
response_data = {
'description': '文章清理API',
'method': 'POST',
'parameters': {
'days': '保留天数默认30天'
},
'example': {
'days': 30
}
}
return api_response(data=response_data, message="API使用说明")
try:
data = json.loads(request.body)
days = data.get('days', 30)
# 启动清理任务
task = cleanup_old_articles.delay(days)
response_data = {
'task_id': task.id,
'days': days,
'status': 'started'
}
return api_response(data=response_data, message="清理任务已启动")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"启动清理任务失败: {e}")
return api_response(message="启动清理任务失败", status=500, error=str(e))
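# A request sketch for the cleanup endpoint; the path is an assumption and 'days'
# is the only parameter documented above:
import requests
resp = requests.post('http://localhost:8000/api/articles/cleanup/', json={'days': 30}, timeout=10)
print(resp.json().get('message'))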
@csrf_exempt
@require_http_methods(["GET"])
def api_stats(request):
"""获取统计信息"""
try:
# 基础统计
total_websites = Website.objects.count()
total_articles = Article.objects.count()
enabled_websites = Website.objects.filter(enabled=True).count()
# 时间统计
today = timezone.now().date()
week_ago = timezone.now() - timedelta(days=7)
month_ago = timezone.now() - timedelta(days=30)
today_articles = Article.objects.filter(created_at__date=today).count()
week_articles = Article.objects.filter(created_at__gte=week_ago).count()
month_articles = Article.objects.filter(created_at__gte=month_ago).count()
# 网站统计
website_stats = []
for website in Website.objects.all():
website_stats.append({
'id': website.id,
'name': website.name,
'article_count': website.article_set.count(),
# 使用getattr安全访问last_crawl属性如果不存在则返回None
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
})
# 分布式爬虫统计
nodes = distributed_crawler.get_available_nodes()
batches = distributed_crawler.get_all_batches()
response_data = {
'overview': {
'total_websites': total_websites,
'enabled_websites': enabled_websites,
'total_articles': total_articles,
'today_articles': today_articles,
'week_articles': week_articles,
'month_articles': month_articles,
},
'websites': website_stats,
'crawler': {
'active_nodes': len(nodes),
'total_batches': len(batches),
'recent_batches': batches[:5],
}
}
return api_response(data=response_data, message="获取统计信息成功")
except Exception as e:
logger.error(f"获取统计信息失败: {e}")
return api_response(message="获取统计信息失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def export_articles(request):
"""导出文章"""
try:
data = json.loads(request.body)
article_ids = data.get('article_ids', [])
export_format = data.get('format', 'docx') # 默认改为docx格式
if not article_ids:
return api_response(message="请选择要导出的文章", status=400, error="No articles selected")
# 获取文章数据
articles = Article.objects.filter(id__in=article_ids).select_related('website')
if not articles.exists():
return api_response(message="未找到指定的文章", status=404, error="Articles not found")
import os # 添加导入
from django.conf import settings # 添加导入
if export_format == 'json':
# 导出为JSON格式
articles_data = []
for article in articles:
articles_data.append({
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
},
'media_files': article.media_files,
})
response = HttpResponse(
json.dumps(articles_data, ensure_ascii=False, indent=2),
content_type='application/json'
)
response['Content-Disposition'] = 'attachment; filename="articles.json"'
return response
elif export_format == 'csv':
# 导出为CSV格式
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(['ID', '标题', '网址', '内容', '创建时间', '网站'])
for article in articles:
writer.writerow([
article.id,
article.title,
article.url,
article.content[:1000] + '...' if len(article.content) > 1000 else article.content,
article.created_at.isoformat(),
article.website.name
])
response = HttpResponse(output.getvalue(), content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
return response
elif export_format == 'docx':
# 导出为Word格式每个文章一个文件夹
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
for article in articles:
# 创建文章文件夹名称
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
folder_name = f"article_{article.id}_{safe_title}"[:50]
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"网址: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 添加到ZIP文件
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
# 添加媒体文件(如果存在)
if article.media_files:
for media in article.media_files:
try:
# 如果是本地文件路径
if not media.startswith('http'):
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
if os.path.exists(media_path):
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
# 如果是URL格式的媒体文件
else:
import requests
from io import BytesIO
response = requests.get(media, timeout=10)
if response.status_code == 200:
image_stream = BytesIO(response.content)
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
zip_file.writestr(media_filename, image_stream.getvalue())
except Exception:
# 忽略无法添加的媒体文件
pass
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
return response
elif export_format == 'zip':
# 导出为ZIP包每个文章一个文件夹
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
for article in articles:
# 创建文章文件夹名称
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
folder_name = f"article_{article.id}_{safe_title}"[:50]
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"网址: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 添加到ZIP文件
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
# 添加媒体文件(如果存在)
if article.media_files:
for media in article.media_files:
try:
# 如果是本地文件路径
if not media.startswith('http'):
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
if os.path.exists(media_path):
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
# 如果是URL格式的媒体文件
else:
import requests
from io import BytesIO
response = requests.get(media, timeout=10)
if response.status_code == 200:
image_stream = BytesIO(response.content)
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
zip_file.writestr(media_filename, image_stream.getvalue())
except Exception:
# 忽略无法添加的媒体文件
pass
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
return response
else:
return api_response(message="不支持的导出格式", status=400, error="Unsupported format")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"导出文章失败: {e}")
return api_response(message="导出文章失败", status=500, error=str(e))
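# A request sketch for the export endpoint; the path is an assumption, and the body
# mirrors the article_ids/format fields read above (docx and zip both return a ZIP):
import requests
resp = requests.post(
    'http://localhost:8000/api/articles/export/',
    json={'article_ids': [1, 2], 'format': 'docx'},
    timeout=30,
)
with open('articles.zip', 'wb') as fh:
    fh.write(resp.content)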


@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'core'
def ready(self):
"""应用启动时执行"""
# 导入Admin扩展
import core.admin_extended


@@ -9,7 +9,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔')
parser.add_argument('--platform', type=str, default='all',
help='指定平台类型: all(全部), web(网站), mobile(移动端)')
help='指定平台类型: all(全部), web(网站)')
def handle(self, *args, **options):
media_list = options['media']


@@ -9,7 +9,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['cctv', 'cctvnews', 'mobile', 'all'],
choices=['cctv', 'cctvnews', 'all'],
help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')
def handle(self, *args, **options):


@@ -3,13 +3,12 @@ from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang-20250815: 因URL问题移除中国网-省份
class Command(BaseCommand):
help = "全站递归爬取 中国网主网,不转发二级子网站"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['china', 'province', 'all'],
choices=['china', 'all'],
help='选择爬取平台: china(中国网主网), all(全部)')
def handle(self, *args, **options):
@@ -23,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'http://www.china.com.cn',
'article_selector': 'a'
},
# 'province': {
# 'name': '中国网一省份',
# 'base_url': 'http://www.china.com.cn',
# 'start_url': 'http://www.china.com.cn/province',
# 'article_selector': 'a'
# }
}
if platform == 'all':


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['chinanews', 'mobile', 'all'],
choices=['chinanews', 'all'],
help='选择爬取平台: chinanews(中国新闻社), all(全部)')
def handle(self, *args, **options):


@@ -50,4 +50,4 @@ class Command(BaseCommand):
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))


@@ -50,4 +50,4 @@ class Command(BaseCommand):
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['fzrb', 'mobile', 'all'],
choices=['fzrb', 'all'],
help='选择爬取平台: fzrb(法治日报), all(全部)')
def handle(self, *args, **options):


@@ -2,13 +2,14 @@ from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang-20250815: 光明日报反爬,会被阻挡
# jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
class Command(BaseCommand):
help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['gmrb', 'mobile', 'all'],
choices=['gmrb', 'all'],
help='选择爬取平台: gmrb(光明日报), all(全部)')
def handle(self, *args, **options):


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['grrb', 'mobile', 'all'],
choices=['grrb', 'all'],
help='选择爬取平台: grrb(工人日报), all(全部)')
def handle(self, *args, **options):


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['jjrb', 'mobile', 'all'],
choices=['jjrb', 'all'],
help='选择爬取平台: jjrb(经济日报), all(全部)')
def handle(self, *args, **options):


@@ -9,7 +9,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['kjrb', 'mobile', 'all'],
choices=['kjrb', 'all'],
help='选择爬取平台: kjrb(科技日报), all(全部)')
def handle(self, *args, **options):


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['nmrb', 'mobile', 'all'],
choices=['nmrb', 'all'],
help='选择爬取平台: nmrb(农民日报), all(全部)')
def handle(self, *args, **options):


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['pla', 'mobile', 'all'],
help='选择爬取平台: pla(解放军报), mobile(移动端), all(全部)')
choices=['pla', 'all'],
help='选择爬取平台: pla(解放军报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['rmzxb', 'mobile', 'all'],
help='选择爬取平台: rmzxb(人民政协网), mobile(移动端), all(全部)')
choices=['rmzxb', 'all'],
help='选择爬取平台: rmzxb(人民政协网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['news', 'xinhuanet', 'mobile', 'all'],
help='选择爬取平台: news(新华网), xinhuanet(新华网主站), mobile(移动端), all(全部)')
choices=['news', 'all'],
help='选择爬取平台: news(新华网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,18 +22,7 @@ class Command(BaseCommand):
'start_url': 'https://www.news.cn',
'article_selector': 'a'
},
'xinhuanet': {
'name': '新华网主站',
'base_url': 'https://www.xinhuanet.com',
'start_url': 'https://www.xinhuanet.com',
'article_selector': 'a'
},
'mobile': {
'name': '新华社移动端',
'base_url': 'https://m.xinhuanet.com',
'start_url': 'https://m.xinhuanet.com',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xuexi', 'central', 'provincial', 'all'],
help='选择爬取平台: xuexi(学习强国主站), central(中央媒体), provincial(省级平台), all(全部)')
choices=['xuexi', 'all'],
help='选择爬取平台: xuexi(学习强国主站), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,18 +22,6 @@ class Command(BaseCommand):
'start_url': 'https://www.xuexi.cn',
'article_selector': 'a'
},
'central': {
'name': '学习强国中央媒体',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn/central',
'article_selector': 'a'
},
'provincial': {
'name': '学习强国省级平台',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn/provincial',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xxsb', 'mobile', 'all'],
help='选择爬取平台: xxsb(学习时报), mobile(移动端), all(全部)')
choices=['xxsb', 'all'],
help='选择爬取平台: xxsb(学习时报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,6 @@ class Command(BaseCommand):
'start_url': 'http://www.studytimes.cn',
'article_selector': 'a'
},
'mobile': {
'name': '学习时报移动端',
'base_url': 'http://m.studytimes.cn',
'start_url': 'http://m.studytimes.cn',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgfnb', 'mobile', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), mobile(移动端), all(全部)')
choices=['zgfnb', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'http://www.cnwomen.com.cn',
'article_selector': 'a'
},
'mobile': {
'name': '中国妇女报移动端',
'base_url': 'http://m.cnwomen.com.cn',
'start_url': 'http://m.cnwomen.com.cn',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgjwjc', 'mobile', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报), mobile(移动端), all(全部)')
choices=['zgjwjc', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -18,16 +18,10 @@ class Command(BaseCommand):
platforms = {
'zgjwjc': {
'name': '中国纪检监察报',
'base_url': 'http://www.jjjcb.cn',
'start_url': 'http://www.jjjcb.cn',
'base_url': 'https://jjjcb.ccdi.gov.cn',
'start_url': 'https://jjjcb.ccdi.gov.cn',
'article_selector': 'a'
},
'mobile': {
'name': '中国纪检监察报移动端',
'base_url': 'http://m.jjjcb.cn',
'start_url': 'http://m.jjjcb.cn',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgqnb', 'mobile', 'all'],
help='选择爬取平台: zgqnb(中国青年报), mobile(移动端), all(全部)')
choices=['zgqnb', 'all'],
help='选择爬取平台: zgqnb(中国青年报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'https://www.cyol.com',
'article_selector': 'a'
},
'mobile': {
'name': '中国青年报移动端',
'base_url': 'https://m.cyol.com',
'start_url': 'https://m.cyol.com',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -6,6 +6,10 @@ import os
from django.conf import settings
import zipfile
from django.utils import timezone
from bs4 import BeautifulSoup
# 添加python-docx库支持
import io
from docx import Document
class Command(BaseCommand):
@@ -119,201 +123,100 @@ class Command(BaseCommand):
# 添加Word格式导出方法
def export_as_word(self, articles_data, output_path):
try:
from docx import Document
from docx.shared import Inches
except ImportError:
self.stdout.write(self.style.ERROR('缺少python-docx库请安装: pip install python-docx'))
return
# 创建Word文档
doc = Document()
doc.add_heading('文章导出', 0)
for article_data in articles_data:
# 添加文章标题
doc.add_heading(article_data['title'], level=1)
# 添加文章元数据
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容移除标签
from bs4 import BeautifulSoup
soup = BeautifulSoup(article_data['content'], 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
from io import BytesIO
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息
if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings
from io import BytesIO
import requests
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 添加分页符
doc.add_page_break()
# 保存文档
doc.save(output_path)
# 创建一个新的Word文档
document = Document()
document.add_heading('文章导出', 0)
for article_data in articles_data:
# 添加文章标题
document.add_heading(article_data['title'], level=1)
# 添加文章信息
document.add_paragraph(f"网站: {article_data['website']}")
document.add_paragraph(f"URL: {article_data['url']}")
document.add_paragraph(f"发布时间: {article_data['pub_date']}")
document.add_paragraph(f"创建时间: {article_data['created_at']}")
# 添加内容标题
document.add_heading('内容:', level=2)
# 处理HTML内容移除标签
soup = BeautifulSoup(article_data['content'], 'html.parser')
content_text = soup.get_text()
document.add_paragraph(content_text)
# 添加分页符分隔文章
document.add_page_break()
# 保存文档
document.save(output_path)
self.stdout.write(self.style.SUCCESS(f'成功导出为Word格式: {output_path}'))
except Exception as e:
self.stdout.write(self.style.ERROR(f'导出Word格式失败: {e}'))
def export_with_media(self, articles_data, media_files, output_path, format_type):
# 创建ZIP文件
with zipfile.ZipFile(output_path, 'w') as zipf:
# 添加文章数据文件
data_filename = f'articles.{format_type}'
if format_type == 'json':
json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
zipf.writestr(data_filename, json_data)
elif format_type == 'csv':
# 创建CSV内容
if articles_data:
import io
csv_buffer = io.StringIO()
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
writer.writeheader()
for article_data in articles_data:
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
'media_files'] else ''
writer.writerow(article_data)
zipf.writestr(data_filename, csv_buffer.getvalue())
elif format_type == 'docx':
# 创建Word文档并保存到ZIP
try:
from docx import Document
from docx.shared import Inches
from io import BytesIO
doc = Document()
doc.add_heading('文章导出', 0)
for article_data in articles_data:
doc.add_heading(article_data['title'], level=1)
# 为每篇文章创建独立的文件
for article_data in articles_data:
article_folder = f"article_{article_data['id']}_{article_data['title']}"
# 限制文件夹名称长度并移除非法字符
article_folder = article_folder[:50].rstrip()
article_folder = "".join(c for c in article_folder if c.isalnum() or c in (' ','_','-')).rstrip()
# 添加文章数据文件
if format_type == 'docx':
# 创建Word文档并保存到ZIP
data_filename = f'{article_folder}/article.docx'
try:
# 创建文章信息Word文档
doc = Document()
doc.add_heading(article_data['title'], 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
doc.add_heading('内容', level=2)
from bs4 import BeautifulSoup
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
soup = BeautifulSoup(article_data['content'], 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
doc.add_page_break()
# 将文档保存到内存中再写入ZIP
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
zipf.writestr(data_filename, doc_buffer.read())
except ImportError:
zipf.writestr(data_filename, "错误缺少python-docx库无法生成Word文档")
# 添加媒体文件
for media_path in media_files:
arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
zipf.write(media_path, arcname)
except Exception as e:
error_msg = f"错误无法生成文章Word文档 - {str(e)}"
zipf.writestr(data_filename, error_msg)
# 添加媒体文件到文章的media子文件夹
if article_data['media_files']:
for media_file in article_data['media_files']:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加媒体文件到ZIP中的media子文件夹
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.write(full_path, media_filename)
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
import requests
from io import BytesIO
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.writestr(media_filename, image_stream.getvalue())
except Exception as e:
# 错误处理,跳过无法添加的文件
pass
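For reference, a standalone sketch of the folder-name sanitization used above and the archive layout it produces; sanitize_folder_name is a hypothetical helper extracted here only for illustration, not part of the command itself.

def sanitize_folder_name(article_id, title, max_len=50):
    # Same idea as above: truncate first, then keep only alphanumerics, spaces, '_' and '-'.
    name = f"article_{article_id}_{title}"[:max_len].rstrip()
    return "".join(c for c in name if c.isalnum() or c in (' ', '_', '-')).rstrip()

# Resulting ZIP layout (sketch):
#   articles.json / articles.csv            - when format_type is 'json' or 'csv'
#   article_<id>_<title>/article.docx       - one Word file per article when format_type is 'docx'
#   article_<id>_<title>/media/<filename>   - media files referenced by that article
print(sanitize_folder_name(1, "Hello: World/Test"))  # -> article_1_Hello WorldTest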

View File

@@ -3,6 +3,7 @@
{% block object-tools %}
{{ block.super }}
<!--
<div style="margin-top: 10px;">
<form method="post" action="{% url 'admin:run_crawler' %}" style="display: inline-block;">
{% csrf_token %}
@@ -16,4 +17,5 @@
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
</form>
</div>
-->
{% endblock %}

View File

@@ -0,0 +1,304 @@
{% extends "admin/base_site.html" %}
{% load static %}
{% block title %}爬虫状态 - {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block extrastyle %}
<style>
.status-card {
background: white;
border: 1px solid #ddd;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.status-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #f0f0f0;
}
.status-title {
font-size: 24px;
font-weight: bold;
color: #333;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 30px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
.stat-number {
font-size: 32px;
font-weight: bold;
margin-bottom: 5px;
}
.stat-label {
font-size: 14px;
opacity: 0.9;
}
.nodes-section, .batches-section {
margin-top: 30px;
}
.section-title {
font-size: 20px;
font-weight: bold;
margin-bottom: 15px;
color: #333;
}
.node-item, .batch-item {
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 6px;
padding: 15px;
margin-bottom: 10px;
}
.node-header, .batch-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
.node-name, .batch-id {
font-weight: bold;
color: #333;
}
.node-status, .batch-status {
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
font-weight: bold;
}
.status-active {
background: #d4edda;
color: #155724;
}
.status-running {
background: #fff3cd;
color: #856404;
}
.status-completed {
background: #d1ecf1;
color: #0c5460;
}
.status-failed {
background: #f8d7da;
color: #721c24;
}
.node-details, .batch-details {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 10px;
font-size: 14px;
}
.detail-item {
display: flex;
justify-content: space-between;
}
.detail-label {
color: #666;
}
.detail-value {
font-weight: bold;
color: #333;
}
.progress-bar {
width: 100%;
height: 8px;
background: #e9ecef;
border-radius: 4px;
overflow: hidden;
margin-top: 10px;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #28a745, #20c997);
transition: width 0.3s ease;
}
.refresh-btn {
background: #007bff;
color: white;
border: none;
padding: 8px 16px;
border-radius: 4px;
cursor: pointer;
font-size: 14px;
}
.refresh-btn:hover {
background: #0056b3;
}
.no-data {
text-align: center;
color: #666;
padding: 40px;
font-style: italic;
}
</style>
{% endblock %}
{% block content %}
<div class="status-card">
<div class="status-header">
<h1 class="status-title">爬虫状态监控</h1>
<button class="refresh-btn" onclick="location.reload()">刷新</button>
</div>
<!-- 统计卡片 -->
<div class="stats-grid">
<div class="stat-card">
<div class="stat-number">{{ task_stats.total_nodes }}</div>
<div class="stat-label">活跃节点</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ task_stats.active_tasks }}</div>
<div class="stat-label">运行中任务</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ task_stats.total_batches }}</div>
<div class="stat-label">总批次</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ nodes|length }}</div>
<div class="stat-label">在线节点</div>
</div>
</div>
<!-- 节点状态 -->
<div class="nodes-section">
<h2 class="section-title">爬虫节点状态</h2>
{% if nodes %}
{% for node in nodes %}
<div class="node-item">
<div class="node-header">
<span class="node-name">{{ node.node_id }}</span>
<span class="node-status status-active">{{ node.status }}</span>
</div>
<div class="node-details">
<div class="detail-item">
<span class="detail-label">活跃任务:</span>
<span class="detail-value">{{ node.active_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">完成任务:</span>
<span class="detail-value">{{ node.completed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">失败任务:</span>
<span class="detail-value">{{ node.failed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">最后心跳:</span>
<span class="detail-value">
{% if node.last_heartbeat %}
{{ node.last_heartbeat|date:"H:i:s" }}
{% else %}
未知
{% endif %}
</span>
</div>
</div>
</div>
{% endfor %}
{% else %}
<div class="no-data">
暂无活跃的爬虫节点
</div>
{% endif %}
</div>
<!-- 批次状态 -->
<div class="batches-section">
<h2 class="section-title">最近批次</h2>
{% if batches %}
{% for batch in batches %}
<div class="batch-item">
<div class="batch-header">
<span class="batch-id">{{ batch.batch_id }}</span>
<span class="batch-status status-{{ batch.status }}">
{% if batch.status == 'running' %}
运行中
{% elif batch.status == 'completed' %}
已完成
{% elif batch.status == 'failed' %}
失败
{% else %}
{{ batch.status }}
{% endif %}
</span>
</div>
<div class="batch-details">
<div class="detail-item">
<span class="detail-label">总任务:</span>
<span class="detail-value">{{ batch.total_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">已完成:</span>
<span class="detail-value">{{ batch.completed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">失败:</span>
<span class="detail-value">{{ batch.failed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">进度:</span>
<span class="detail-value">{{ batch.progress|floatformat:1 }}%</span>
</div>
</div>
{% if batch.status == 'running' %}
<div class="progress-bar">
<div class="progress-fill" style="width: {{ batch.progress }}%"></div>
</div>
{% endif %}
</div>
{% endfor %}
{% else %}
<div class="no-data">
暂无批次记录
</div>
{% endif %}
</div>
</div>
<script>
// 自动刷新页面
setTimeout(function () {
location.reload();
}, 30000); // 30秒刷新一次
</script>
{% endblock %}
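The admin view that renders this template is not included in this commit; the sketch below only illustrates the context keys the template expects (the view name, template path, and data source are all assumptions).

# Hypothetical view sketch: supplies the nodes/batches/task_stats context used above.
from django.shortcuts import render
from django.utils import timezone

def crawler_status_view(request):
    # In practice these would come from Redis or the task-tracking tables (assumption).
    nodes = [{'node_id': 'node-1', 'status': 'active', 'active_tasks': 2,
              'completed_tasks': 10, 'failed_tasks': 1, 'last_heartbeat': timezone.now()}]
    batches = [{'batch_id': 'batch-20250815', 'status': 'running', 'total_tasks': 20,
                'completed_tasks': 12, 'failed_tasks': 1, 'progress': 60.0}]
    task_stats = {'total_nodes': len(nodes),
                  'active_tasks': sum(n['active_tasks'] for n in nodes),
                  'total_batches': len(batches)}
    return render(request, 'admin/crawler_status.html',
                  {'nodes': nodes, 'batches': batches, 'task_stats': task_stats})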

View File

@@ -40,7 +40,16 @@
margin-top: 20px;
}
.content img {
/* 优化:确保图片和视频不会超出容器显示 */
.content img, .content video {
max-width: 100%;
height: auto;
display: block;
margin: 10px 0;
}
/* 优化:确保iframe也不会超出容器显示 */
.content iframe {
max-width: 100%;
height: auto;
}
@@ -61,7 +70,7 @@
body {
padding: 10px;
}
.container {
padding: 15px;
}
@@ -69,21 +78,21 @@
</style>
</head>
<body>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<h1>{{ article.title }}</h1>
<h1>{{ article.title }}</h1>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>
<div class="content">
{{ article.content|safe }}
</div>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>
<div class="content">
{{ article.content|safe }}
</div>
</div>
</body>
</html>

View File

@@ -17,7 +17,7 @@
background: white;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
border-radius: 8px; /* 添加圆角 */
}
@@ -240,7 +240,7 @@
<form method="get">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
{% if selected_website %}
<input type="hidden" name="website" value="{{ selected_website.id }}">
<input type="hidden" name="website" value="{{ selected_website.id }}">
{% endif %}
<input type="submit" value="搜索">
</form>
@@ -251,9 +251,11 @@
<div class="sidebar">
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
{% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
</div>
@@ -262,10 +264,10 @@
<div class="main-content">
<!-- 新增:搜索结果信息 -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
<!-- 新增:导出功能 -->
@@ -280,60 +282,70 @@
<ul>
{% for article in page_obj %}
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
{% endfor %}
</ul>
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; 首页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo;
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
<!-- 修改:优化页码显示逻辑 -->
{% with page_obj.paginator as paginator %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% endwith %}
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
{% endif %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% endif %}
{% endif %}
</div>
</div>
@@ -396,25 +408,25 @@
format: 'json'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.json';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 导出为CSV功能
@@ -434,25 +446,25 @@
format: 'csv'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.csv';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 新增:导出为ZIP包功能
@@ -472,25 +484,25 @@
format: 'zip' // 指定导出格式为ZIP
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.zip';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 初始化导出按钮状态
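The same export flow can be driven from Python; a sketch using requests is shown below. The /export-articles/ path comes from core/urls.py, while the 'article_ids' field name is an assumption (the request body is truncated in this hunk).

# Sketch: mirrors the fetch() calls above. Endpoint path taken from core/urls.py;
# the 'article_ids' field name is assumed, not confirmed by this template.
import requests

base = 'http://localhost:8000'
session = requests.Session()
session.get(base + '/')                               # obtain the csrftoken cookie
csrf = session.cookies.get('csrftoken', '')

resp = session.post(base + '/export-articles/',
                    json={'article_ids': [1, 2, 3], 'format': 'zip'},
                    headers={'X-CSRFToken': csrf, 'Referer': base + '/'},
                    timeout=60)
resp.raise_for_status()
with open('articles.zip', 'wb') as f:                 # save the returned blob, as the JS does
    f.write(resp.content)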

View File

@@ -1,3 +1,312 @@
from django.test import TestCase
import os
import tempfile
import shutil
from django.test import TestCase, override_settings
from django.core.management import call_command
from django.core.management.base import CommandError
from django.utils import timezone
from django.core.files.uploadedfile import SimpleUploadedFile
from unittest.mock import patch, MagicMock
from .models import Website, Article
from .utils import process_article, download_media, is_valid_url, full_site_crawler
from .tasks import crawl_website, crawl_all_websites, health_check
# Create your tests here.
class WebsiteModelTest(TestCase):
"""网站模型测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com',
description='测试描述'
)
def test_website_creation(self):
"""测试网站创建"""
self.assertEqual(self.website.name, '测试网站')
self.assertEqual(self.website.base_url, 'https://test.com')
self.assertTrue(self.website.enabled)
def test_website_str(self):
"""测试网站字符串表示"""
self.assertEqual(str(self.website), '测试网站')
class ArticleModelTest(TestCase):
"""文章模型测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
self.article = Article.objects.create(
website=self.website,
title='测试文章',
url='https://test.com/article/1',
content='<p>测试内容</p>',
media_files=['image1.jpg', 'image2.jpg']
)
def test_article_creation(self):
"""测试文章创建"""
self.assertEqual(self.article.title, '测试文章')
self.assertEqual(self.article.url, 'https://test.com/article/1')
self.assertEqual(len(self.article.media_files), 2)
def test_article_str(self):
"""测试文章字符串表示"""
self.assertEqual(str(self.article), '测试文章')
class UtilsTest(TestCase):
"""工具函数测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
self.temp_dir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.temp_dir)
def test_is_valid_url(self):
"""测试URL验证"""
from .utils import is_valid_url
# 有效URL
self.assertTrue(is_valid_url('https://test.com/article', 'test.com'))
self.assertTrue(is_valid_url('http://test.com/article', 'test.com'))
# 无效URL
self.assertFalse(is_valid_url('https://other.com/article', 'test.com'))
self.assertFalse(is_valid_url('ftp://test.com/article', 'test.com'))
self.assertFalse(is_valid_url('invalid-url', 'test.com'))
@patch('core.utils.requests.get')
def test_download_media(self, mock_get):
"""测试媒体下载"""
# 模拟响应
mock_response = MagicMock()
mock_response.content = b'fake image content'
mock_response.headers = {'content-type': 'image/jpeg'}
mock_get.return_value = mock_response
# 测试下载
result = download_media('https://test.com/image.jpg', self.temp_dir)
self.assertIsNotNone(result)
self.assertTrue(os.path.exists(result))
@patch('core.utils.requests.get')
@patch('core.utils.download_media')
def test_process_article_success(self, mock_download_media, mock_get):
"""测试文章处理成功"""
# 模拟HTML响应
html_content = '''
<html>
<head><title>测试文章</title></head>
<body>
<h1>测试文章标题</h1>
<div class="content">
<p>测试文章内容</p>
<img src="https://test.com/image.jpg">
</div>
</body>
</html>
'''
mock_response = MagicMock()
mock_response.text = html_content
mock_response.encoding = 'utf-8'
mock_response.raise_for_status.return_value = None
mock_get.return_value = mock_response
# 模拟媒体下载
mock_download_media.return_value = '/tmp/test_image.jpg'
# 测试文章处理
process_article('https://test.com/article/1', self.website)
# 验证文章是否保存
article = Article.objects.filter(url='https://test.com/article/1').first()
self.assertIsNotNone(article)
self.assertEqual(article.title, '测试文章标题')
class ManagementCommandsTest(TestCase):
"""管理命令测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
@patch('core.management.commands.crawl_all_media.call_command')
def test_crawl_all_media_command(self, mock_call_command):
"""测试批量爬取命令"""
# 模拟命令执行
mock_call_command.return_value = None
# 执行命令
call_command('crawl_all_media', media='rmrb,xinhua')
# 验证命令被调用
mock_call_command.assert_called()
class CeleryTasksTest(TestCase):
"""Celery任务测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
@patch('core.tasks.full_site_crawler')
def test_crawl_website_task(self, mock_crawler):
"""测试单个网站爬取任务"""
# 模拟爬虫函数
mock_crawler.return_value = None
# 执行任务
result = crawl_website(self.website.id)
# 验证结果
self.assertEqual(result['website_id'], self.website.id)
self.assertEqual(result['website_name'], '测试网站')
self.assertEqual(result['status'], 'success')
def test_crawl_website_task_invalid_id(self):
"""测试无效网站ID的任务"""
# 执行任务
with self.assertRaises(Exception):
crawl_website(99999)
@patch('core.tasks.crawl_website.delay')
def test_crawl_all_websites_task(self, mock_delay):
"""测试批量爬取任务"""
# 模拟子任务
mock_result = MagicMock()
mock_result.id = 'task-123'
mock_delay.return_value = mock_result
# 执行任务
result = crawl_all_websites()
# 验证结果
self.assertEqual(result['total_websites'], 1)
self.assertEqual(result['status'], 'started')
def test_health_check_task(self):
"""测试健康检查任务"""
# 执行任务
result = health_check()
# 验证结果
self.assertEqual(result['database'], 'ok')
self.assertEqual(result['website_count'], 1)
self.assertEqual(result['article_count'], 0)
class IntegrationTest(TestCase):
"""集成测试"""
def setUp(self):
self.website = Website.objects.create(
name='集成测试网站',
base_url='https://integration-test.com'
)
def test_full_workflow(self):
"""测试完整工作流程"""
# 1. 创建网站
self.assertEqual(Website.objects.count(), 1)
# 2. 创建文章
article = Article.objects.create(
website=self.website,
title='集成测试文章',
url='https://integration-test.com/article/1',
content='<p>集成测试内容</p>'
)
# 3. 验证关联关系
self.assertEqual(article.website, self.website)
self.assertEqual(self.website.article_set.count(), 1)
# 4. 验证数据完整性
self.assertIsNotNone(article.created_at)
self.assertIsInstance(article.media_files, list)
@override_settings(MEDIA_ROOT=tempfile.mkdtemp())
class MediaHandlingTest(TestCase):
"""媒体文件处理测试"""
def setUp(self):
self.website = Website.objects.create(
name='媒体测试网站',
base_url='https://media-test.com'
)
def test_media_files_field(self):
"""测试媒体文件字段"""
article = Article.objects.create(
website=self.website,
title='媒体测试文章',
url='https://media-test.com/article/1',
content='<p>测试内容</p>',
media_files=['image1.jpg', 'video1.mp4']
)
# 验证媒体文件列表
self.assertEqual(len(article.media_files), 2)
self.assertIn('image1.jpg', article.media_files)
self.assertIn('video1.mp4', article.media_files)
class ErrorHandlingTest(TestCase):
"""错误处理测试"""
def test_duplicate_url_handling(self):
"""测试重复URL处理"""
website = Website.objects.create(
name='错误测试网站',
base_url='https://error-test.com'
)
# 创建第一篇文章
article1 = Article.objects.create(
website=website,
title='第一篇文章',
url='https://error-test.com/article/1',
content='<p>内容1</p>'
)
# 尝试创建相同URL的文章
with self.assertRaises(Exception):
Article.objects.create(
website=website,
title='第二篇文章',
url='https://error-test.com/article/1', # 相同URL
content='<p>内容2</p>'
)
def test_invalid_website_data(self):
"""测试无效网站数据"""
# 测试重复名称unique约束
Website.objects.create(
name='测试网站1',
base_url='https://test1.com'
)
with self.assertRaises(Exception):
Website.objects.create(
name='测试网站1', # 重复名称
base_url='https://test2.com'
)
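Besides manage.py test, requirements.txt pins pytest, pytest-django and pytest-cov, so the suite can also run under pytest; a minimal pytest.ini sketch follows (the settings module name is inferred from WSGI_APPLICATION in settings.py and is an assumption).

[pytest]
DJANGO_SETTINGS_MODULE = green_classroom.settings
python_files = tests.py test_*.py
addopts = --cov=core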

View File

@@ -1,17 +1,24 @@
from django.urls import path, include
from . import views
# 添加以下导入
from django.contrib import admin
from django.urls import path
from . import views, api
urlpatterns = [
# 原有视图
path('', views.article_list, name='article_list'),
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
path('run-crawler/', views.run_crawler, name='run_crawler'),
# 新增:检查爬虫状态的路由
path('crawler-status/', views.crawler_status, name='crawler_status'),
# 新增:暂停爬虫的路由
path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
# 添加导出文章的路由
path('export-articles/', views.export_articles, name='export_articles'),
# 添加自定义管理后台的路由
# API接口
path('api/health/', api.HealthView.as_view(), name='api_health'),
path('api/websites/', api.WebsitesView.as_view(), name='api_websites'),
path('api/websites/<int:website_id>/', api.api_website_detail, name='api_website_detail'),
path('api/websites/<int:website_id>/crawl/', api.api_crawl_website, name='api_crawl_website'),
path('api/articles/', api.api_articles, name='api_articles'),
path('api/articles/<int:article_id>/', api.api_article_detail, name='api_article_detail'),
path('api/crawler/status/', api.api_crawler_status, name='api_crawler_status'),
path('api/crawler/distributed/', api.api_start_distributed_crawl, name='api_start_distributed_crawl'),
path('api/crawler/batch/<str:batch_id>/', api.api_batch_status, name='api_batch_status'),
path('api/cleanup/', api.api_cleanup_articles, name='api_cleanup_articles'),
path('api/stats/', api.api_stats, name='api_stats'),
# 添加导出文章的URL
path('api/export/', api.export_articles, name='export_articles'),
]
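With rest_framework.authtoken installed and TokenAuthentication enabled in settings.py, these API routes can be exercised roughly as below; the host and token are placeholders, and whether a given endpoint requires authentication depends on the view code, which is not part of this diff.

# Sketch: calling the API routes above with a DRF token (placeholder values).
import requests

BASE = 'http://localhost:8000'
TOKEN = '<drf-token>'        # e.g. created with: python manage.py drf_create_token <username>
headers = {'Authorization': f'Token {TOKEN}'}

print(requests.get(f'{BASE}/api/health/', headers=headers, timeout=10).json())
print(requests.get(f'{BASE}/api/websites/', headers=headers, timeout=10).json())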

View File

@@ -26,8 +26,9 @@ def get_selenium_driver():
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
chrome_options.add_argument(
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
return driver
@@ -35,6 +36,7 @@ def get_selenium_driver():
print(f"创建Selenium WebDriver失败: {e}")
return None
def get_page_with_selenium(url, website_name):
"""使用Selenium获取动态加载的页面内容"""
driver = None
@@ -42,17 +44,17 @@ def get_page_with_selenium(url, website_name):
driver = get_selenium_driver()
if not driver:
return None
print(f"使用Selenium加载页面: {url}")
driver.get(url)
# 等待页面加载完成
wait_time = 10
if "学习强国" in website_name:
wait_time = 15 # 学习强国需要更长时间
elif "法治日报" in website_name:
wait_time = 12 # 法治日报需要较长时间
# 等待页面主要内容加载
try:
WebDriverWait(driver, wait_time).until(
@@ -60,14 +62,14 @@ def get_page_with_selenium(url, website_name):
)
except:
print(f"等待页面加载超时: {url}")
# 额外等待时间确保动态内容加载完成
time.sleep(3)
# 获取页面源码
page_source = driver.page_source
return page_source
except Exception as e:
print(f"Selenium获取页面失败: {url}, 错误: {e}")
return None
@@ -78,6 +80,7 @@ def get_page_with_selenium(url, website_name):
except:
pass
def download_media(url, save_dir):
try:
# 添加请求头以避免403 Forbidden错误
@@ -236,7 +239,7 @@ def process_article(url, website):
need_selenium = False
if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
need_selenium = True
try:
if need_selenium:
# 使用Selenium获取动态加载的内容
@@ -244,28 +247,28 @@ def process_article(url, website):
if not page_source:
print(f"Selenium获取页面失败{url}")
return
# 检查页面内容是否过短
min_length = 100 if "法治日报" in website.name else 300
if len(page_source) < min_length:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(page_source, "html.parser")
else:
# 使用requests获取静态内容
resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
resp.raise_for_status()
# 检查是否是重定向页面
if len(resp.text) < 300:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "html.parser")
except Exception as e:
print(f"请求失败:{url},错误:{e}")
return
@@ -353,7 +356,7 @@ def process_article(url, website):
heading_text = heading.get_text(strip=True)
if title_text in heading_text or heading_text in title_text:
heading.decompose()
# 移除class包含title的元素
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
title_element_text = title_element.get_text(strip=True)
@@ -489,13 +492,13 @@ def process_article(url, website):
soup.find("p", class_="title") or
soup.find("title")
)
# 针对求是的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 针对求是的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
@@ -522,7 +525,7 @@ def process_article(url, website):
# 如果 strong 在正文前两段内,就删除
if parent_p in content_tag.find_all("p")[:2]:
strong_tag.decompose()
# 移除h1、h2、h3标题元素中的重复标题
for heading in content_tag.find_all(["h1", "h2", "h3"]):
heading_text = heading.get_text(strip=True)
@@ -530,11 +533,12 @@ def process_article(url, website):
# 确保不删除title_tag本身
if heading != title_tag:
heading.decompose()
# 移除class包含title的元素
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
title_element_text = title_element.get_text(strip=True)
if title_element_text and (title_text in title_element_text or title_element_text in title_text):
if title_element_text and (
title_text in title_element_text or title_element_text in title_text):
# 确保不删除title_tag本身
if title_element != title_tag:
title_element.decompose()
@@ -583,7 +587,7 @@ def process_article(url, website):
soup.find("h2") or # 解放军报使用h2标签作为标题
soup.find("title")
)
# 针对解放军报的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
@@ -606,34 +610,34 @@ def process_article(url, website):
# 移除面包屑导航
for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
breadcrumb.decompose()
# 移除分享相关元素
for share_element in content_tag.find_all("div", class_="share-custom"):
share_element.decompose()
# 移除作者信息段落
for author_p in content_tag.find_all("p"):
text = author_p.get_text(strip=True)
if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
author_p.decompose()
# 移除进度条
for progress in content_tag.find_all("div", class_="progress-bar"):
progress.decompose()
# 移除播放器
for player in content_tag.find_all("div", class_="player"):
player.decompose()
# 移除媒体URL容器
for media in content_tag.find_all("div", id="mediaurl"):
media.decompose()
# 移除新闻列表(但保留其中的内容)
for news_list in content_tag.find_all("ul", id="main-news-list"):
# 不删除整个ul而是unwrap它保留其中的内容
news_list.unwrap()
# 移除编辑信息
for editor_element in content_tag.find_all("div", class_="editor"):
editor_element.decompose()
@@ -744,13 +748,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对工人日报的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 进一步处理如果h1标题包含太多无关信息尝试从title标签提取更简洁的标题
if title_tag and title_tag.name == 'h1':
title_text = title_tag.get_text(strip=True)
@@ -877,11 +881,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国纪检监察报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
@@ -955,11 +959,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国青年报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="main") or # 中国青年报特有内容容器
soup.find("div", class_="content") or
@@ -977,11 +981,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国妇女报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="main") or # 中国妇女报特有内容容器
soup.find("div", class_="news") or # 中国妇女报特有内容容器
@@ -1001,11 +1005,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对法治日报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content-two") or # 优先查找content-two类
soup.find("div", class_="article-content") or # 法治日报特有内容容器
@@ -1058,13 +1062,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对农民日报的特殊处理如果标题出现乱码尝试从title标签提取
if title_tag and title_tag.name == 'h1':
title_text = title_tag.get_text(strip=True)
if title_text and any(char in title_text for char in ['', '', '']):
title_tag = soup.find("title")
# 针对农民日报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
@@ -1078,7 +1082,7 @@ def process_article(url, website):
soup.find("div", class_="article") or
soup.find("div", class_="article-body")
)
# 针对农民日报的特殊处理如果找到多个detailCon选择内容最长的那个
if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
detail_cons = soup.find_all("div", class_="detailCon")
@@ -1116,17 +1120,17 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对学习强国的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# 针对学习强国的特殊处理如果标题太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if title_text and len(title_text) < 10:
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
@@ -1153,17 +1157,17 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对旗帜网的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 针对旗帜网的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# 针对旗帜网的特殊处理如果标题太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
@@ -1232,13 +1236,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国网的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="article") or # 中国网特有内容容器
soup.find("div", class_="main") or
@@ -1281,7 +1285,7 @@ def process_article(url, website):
# 最终标题处理 - 只有在没有网站特定处理时才使用默认处理
if not title_tag:
title_tag = soup.find("h1") or soup.find("title")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
# 对标题进行额外处理,去除可能的多余空白字符
@@ -1564,7 +1568,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
("/content/" in path) or
(path.startswith("/detail/") and len(path) > 10)
)
# 排除列表页面
if "/index.html" in path or path.endswith("/"):
is_article_page = False

View File

@@ -412,4 +412,4 @@ def export_articles(request):
return HttpResponse('不支持的格式', status=400)
except Exception as e:
return HttpResponse(f'导出失败: {str(e)}', status=500)

View File

@@ -10,7 +10,12 @@ For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
import os
from pathlib import Path
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -19,12 +24,12 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl'
SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = os.getenv('DEBUG', 'True').lower() == 'true'
ALLOWED_HOSTS = []
ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1').split(',')
# Application definition
@@ -36,8 +41,15 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'core',
'django_celery_beat',
'django_celery_results',
'rest_framework',
'rest_framework.authtoken',
]
# 导入Admin扩展
# import core.admin_extended # 暂时注释,避免循环导入
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -71,12 +83,30 @@ WSGI_APPLICATION = 'green_classroom.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
# 根据环境变量选择数据库
DB_ENGINE = os.getenv('DB_ENGINE', 'django.db.backends.sqlite3')
if DB_ENGINE == 'django.db.backends.postgresql':
DATABASES = {
'default': {
'ENGINE': DB_ENGINE,
'NAME': os.getenv('DB_NAME', 'green_classroom'),
'USER': os.getenv('DB_USER', 'postgres'),
'PASSWORD': os.getenv('DB_PASSWORD', ''),
'HOST': os.getenv('DB_HOST', 'localhost'),
'PORT': os.getenv('DB_PORT', '5432'),
# 注意:'charset' 是 MySQL 的连接选项,psycopg2/PostgreSQL 不支持,这里不额外传 OPTIONS
}
}
else:
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
@@ -110,17 +140,118 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'
STATIC_URL = '/static/'
STATIC_ROOT = os.getenv('STATIC_ROOT', os.path.join(BASE_DIR, 'data', 'static'))
# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MEDIA_ROOT = os.path.join(BASE_DIR, 'date', 'media')
# 媒体文件配置
MEDIA_ROOT = os.getenv('MEDIA_ROOT', os.path.join(BASE_DIR, 'data', 'media'))
MEDIA_URL = '/media/'
# Celery配置
CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://localhost:6379/0')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60 # 30分钟
# Redis配置
REDIS_URL = os.getenv('REDIS_URL', 'redis://localhost:6379/0')
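These CELERY_* settings are read by a Celery app instance, conventionally defined in green_classroom/celery.py; that module is not part of this diff, so the standard bootstrap is sketched here only as a reference.

# green_classroom/celery.py - standard Celery/Django bootstrap (sketch; the project
# may already contain an equivalent module that is simply not shown in this commit).
import os
from celery import Celery

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')

app = Celery('green_classroom')
app.config_from_object('django.conf:settings', namespace='CELERY')  # reads the CELERY_* values above
app.autodiscover_tasks()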
# 日志配置
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
'style': '{',
},
'simple': {
'format': '{levelname} {message}',
'style': '{',
},
},
'handlers': {
'file': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'class': 'logging.FileHandler',
'filename': os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log')),
'formatter': 'verbose',
},
'console': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'class': 'logging.StreamHandler',
'formatter': 'simple',
},
},
'root': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
},
'loggers': {
'django': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
'propagate': False,
},
'core': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
'propagate': False,
},
},
}
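Note that logging.FileHandler does not create missing directories, so with the default LOG_FILE under data/logs/ the directory must exist before Django loads LOGGING; a small guard placed above the LOGGING dict covers this (sketch):

# Ensure the log directory exists; FileHandler raises FileNotFoundError otherwise.
LOG_FILE = os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log'))
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)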
# 安全设置
if not DEBUG:
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
X_FRAME_OPTIONS = 'DENY'
SECURE_HSTS_SECONDS = 31536000
SECURE_HSTS_INCLUDE_SUBDOMAINS = True
SECURE_HSTS_PRELOAD = True
# 爬虫设置
CRAWLER_TIMEOUT = int(os.getenv('CRAWLER_TIMEOUT', 30))
CRAWLER_MAX_RETRIES = int(os.getenv('CRAWLER_MAX_RETRIES', 3))
CRAWLER_DELAY = int(os.getenv('CRAWLER_DELAY', 1))
# Selenium设置
SELENIUM_HEADLESS = os.getenv('SELENIUM_HEADLESS', 'True').lower() == 'true'
CHROME_DRIVER_PATH = os.getenv('CHROME_DRIVER_PATH', '/usr/bin/chromedriver')
# Sentry监控可选
SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[DjangoIntegration()],
traces_sample_rate=1.0,
send_default_pii=True
)
# Django REST Framework 配置
REST_FRAMEWORK = {
'DEFAULT_RENDERER_CLASSES': [
'rest_framework.renderers.JSONRenderer',
'rest_framework.renderers.BrowsableAPIRenderer',
],
'DEFAULT_PERMISSION_CLASSES': [
'rest_framework.permissions.IsAuthenticated',
],
'DEFAULT_AUTHENTICATION_CLASSES': [
'rest_framework.authentication.SessionAuthentication',
'rest_framework.authentication.TokenAuthentication',
],
}
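For reference, a sample .env covering the environment variables read in this settings module; all values and paths are placeholders, and the file itself is git-ignored.

SECRET_KEY=change-me
DEBUG=False
ALLOWED_HOSTS=localhost,127.0.0.1
DB_ENGINE=django.db.backends.postgresql
DB_NAME=green_classroom
DB_USER=postgres
DB_PASSWORD=change-me
DB_HOST=localhost
DB_PORT=5432
STATIC_ROOT=/srv/green_classroom/data/static
MEDIA_ROOT=/srv/green_classroom/data/media
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
REDIS_URL=redis://localhost:6379/0
LOG_LEVEL=INFO
LOG_FILE=/srv/green_classroom/data/logs/django.log
CRAWLER_TIMEOUT=30
CRAWLER_MAX_RETRIES=3
CRAWLER_DELAY=1
SELENIUM_HEADLESS=True
CHROME_DRIVER_PATH=/usr/bin/chromedriver
# SENTRY_DSN=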

View File

@@ -1,31 +1,80 @@
amqp==5.3.1
asgiref==3.9.1
asttokens==3.0.0
attrs==25.3.0
beautifulsoup4==4.13.4
billiard==4.2.1
bs4==0.0.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.2.1
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
coverage==7.10.3
cron-descriptor==1.4.5
decorator==5.2.1
Django==5.1
django-celery-beat==2.8.1
django-db-connection-pool==1.2.6
django-timezone-field==7.1
django_celery_results==2.6.0
djangorestframework==3.16.1
executing==2.2.0
factory_boy==3.3.3
Faker==37.5.3
h11==0.16.0
idna==3.10
iniconfig==2.1.0
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
kombu==5.5.4
lxml==6.0.0
m3u8==6.0.0
matplotlib-inline==0.1.7
outcome==1.3.0.post0
packaging==25.0
parso==0.8.4
pexpect==4.9.0
pluggy==1.6.0
prompt_toolkit==3.0.51
psycopg2-binary==2.9.10
ptyprocess==0.7.0
pure_eval==0.2.3
pycryptodome==3.23.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.1
pytest-cov==6.2.1
pytest-django==4.11.1
python-crontab==3.3.0
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.1.1
redis==6.4.0
requests==2.32.4
selenium==4.34.2
sentry-sdk==2.35.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
SQLAlchemy==2.0.43
sqlparams==6.2.0
sqlparse==0.5.3
stack-data==0.6.3
tqdm==4.67.1
traitlets==5.14.3
trio==0.30.0
trio-websocket==0.12.2
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uv==0.8.8
vine==5.1.0
wcwidth==0.2.13
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0