Compare commits


5 Commits

SHA1 Message Date
958b087f54 Add Search button 2025-08-11 23:42:14 +08:00
b6bbb90703 Support export for Word 2025-08-11 23:14:56 +08:00
bfd1604872 Add packages 2025-08-11 22:55:57 +08:00
d9d2ea9d99 Add Support dongfangyaocao 2025-08-11 22:20:19 +08:00
6d80326a4e Add Support full site 2025-08-11 14:33:32 +08:00
12 changed files with 1220 additions and 83 deletions

View File

@@ -1,11 +1,349 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article
# Imports for admin actions
from django.contrib import messages
from django.http import HttpResponseRedirect
# Imports for the export features
import csv
from django.http import HttpResponse
import json

# Custom admin sites
class NewsCnAdminSite(AdminSite):
    site_header = "新华网管理后台"
    site_title = "新华网管理"
    index_title = "新华网内容管理"

class DongfangyancaoAdminSite(AdminSite):
    site_header = "东方烟草报管理后台"
    site_title = "东方烟草报管理"
    index_title = "东方烟草报内容管理"

# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    list_display = ('name', 'base_url', 'enabled')

# Custom actions for ArticleAdmin
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # Available admin actions
    actions = ['delete_selected_articles', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json',
               'export_as_word']
    def delete_dongfangyancao_articles(self, request, queryset):
        """Delete every 东方烟草报 article in one step"""
        # Look up the 东方烟草报 website object
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # Delete all of its articles
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)

    # Display name for the action
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in
                   field_names]
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"
    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON"""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write out the JSON payload
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"
    def export_as_word(self, request, queryset):
        """Export the selected articles as a Word document"""
        try:
            from docx import Document
            from io import BytesIO
            from docx.shared import Inches
        except ImportError:
            self.message_user(request, "缺少python-docx库请安装: pip install python-docx", messages.ERROR)
            return

        # Create the Word document
        doc = Document()
        doc.add_heading('文章导出', 0)

        for article in queryset:
            # Article title
            doc.add_heading(article.title, level=1)

            # Article metadata
            doc.add_paragraph(f"网站: {article.website.name}")
            doc.add_paragraph(f"URL: {article.url}")
            doc.add_paragraph(
                f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
            doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")

            # Article body
            doc.add_heading('内容', level=2)
            # Light HTML handling: strip tags and deal with images
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article.content, 'html.parser')

            # Embed images found in the content
            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    # Try to add the image to the document
                    try:
                        import os
                        from django.conf import settings
                        import requests
                        from io import BytesIO

                        # Build the full image path
                        if src.startswith('http'):
                            # Remote image
                            response = requests.get(src, timeout=10)
                            image_stream = BytesIO(response.content)
                            doc.add_picture(image_stream, width=Inches(4.0))
                        else:
                            # Local image
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception as e:
                        # If embedding fails, keep the image URL as text
                        doc.add_paragraph(f"[图片: {src}]")
                # Drop the original img tag
                img.decompose()

            content_text = soup.get_text()
            doc.add_paragraph(content_text)

            # Media file information
            if article.media_files:
                doc.add_heading('媒体文件', level=2)
                for media_file in article.media_files:
                    try:
                        import os
                        from django.conf import settings
                        from io import BytesIO
                        import requests

                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            # Embed the picture
                            doc.add_picture(full_path, width=Inches(4.0))
                        else:
                            # Media file given as a URL
                            if media_file.startswith('http'):
                                response = requests.get(media_file, timeout=10)
                                image_stream = BytesIO(response.content)
                                doc.add_picture(image_stream, width=Inches(4.0))
                            else:
                                doc.add_paragraph(media_file)
                    except Exception as e:
                        doc.add_paragraph(media_file)

            # Page break between articles
            doc.add_page_break()

        # Save to an in-memory buffer
        buffer = BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        # Build the HttpResponse
        from django.http import HttpResponse
        response = HttpResponse(buffer.getvalue(),
                                content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
        response['Content-Disposition'] = 'attachment; filename=articles.docx'
        return response

    export_as_word.short_description = "导出选中文章为Word格式"
# Dedicated article admin classes for each website
class NewsCnArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    actions = ['export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from www.news.cn
        return qs.filter(website__name='www.news.cn')

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']  # skip content to keep the CSV small

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON"""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write out the JSON payload
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    # Available admin actions
    actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from 东方烟草报
        return qs.filter(website__name='东方烟草报')

    def delete_all_articles(self, request, queryset):
        """Delete every article in the current filter (all 东方烟草报 articles)"""
        deleted_count = self.get_queryset(request).delete()[0]
        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)

    # Display name for the action
    delete_all_articles.short_description = "删除所有当前筛选的文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']  # skip content to keep the CSV small

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON"""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write out the JSON payload
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"

# Register the models on their respective admin sites
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)
dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)

View File

@@ -0,0 +1,20 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler

class Command(BaseCommand):
    help = "全站递归爬取 东方烟草报"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="东方烟草报",
            defaults={
                'article_list_url': 'https://www.eastobacco.com/',
                'article_selector': 'a'
            }
        )

        start_url = "https://www.eastobacco.com/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -1,18 +1,20 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler

class Command(BaseCommand):
    help = "全站递归爬取 www.news.cn"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="www.news.cn",
            defaults={
                'article_list_url': 'https://www.news.cn/',
                'article_selector': 'a'
            }
        )

        start_url = "https://www.news.cn/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list

class Command(BaseCommand):
    help = '批量爬取新华网文章'

    def handle(self, *args, **options):
        # Usage marker, to confirm whether this command is still being invoked
        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
        list_url = "https://www.news.cn/legal/index.html"

        try:
            website = Website.objects.get(base_url="https://www.news.cn/")
        except Website.DoesNotExist:
            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
            return

        self.stdout.write(f"开始爬取文章列表页: {list_url}")
        crawl_xinhua_list(list_url, website)
        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
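
The diff does not show the filenames of these management command modules, so the command names below are assumptions. A minimal sketch of driving the new crawlers programmatically (equivalent to `python manage.py <command>`) might look like this:

# Hypothetical command names: the actual module names are not visible in this diff.
from django.core.management import call_command

# Recursive full-site crawl for each configured source site
call_command("crawl_news_cn")          # assumed name of the www.news.cn command
call_command("crawl_dongfangyancao")   # assumed name of the 东方烟草报 command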

View File

@@ -0,0 +1,311 @@
from django.core.management.base import BaseCommand
from core.models import Article, Website
import json
import csv
import os
from django.conf import settings
from django.core.files.storage import default_storage
import zipfile
from django.utils import timezone

class Command(BaseCommand):
    help = '导出文章及相关的媒体文件(图片、视频等)'

    def add_arguments(self, parser):
        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
        parser.add_argument('--output', type=str, default='', help='输出文件路径')
        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')

    def handle(self, *args, **options):
        format_type = options['format'].lower()
        website_name = options['website']
        output_path = options['output']
        include_media = options['include_media']

        # Build the article queryset
        articles = Article.objects.all()
        if website_name:
            try:
                website = Website.objects.get(name=website_name)
                articles = articles.filter(website=website)
            except Website.DoesNotExist:
                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
                return

        if not articles.exists():
            self.stdout.write(self.style.WARNING('没有找到文章'))
            return

        # Prepare the export payload
        articles_data = []
        media_files = []

        for article in articles:
            article_data = {
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.isoformat() if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.isoformat(),
                'media_files': article.media_files
            }
            articles_data.append(article_data)

            # Collect media file paths
            if include_media:
                for media_path in article.media_files:
                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
                    if os.path.exists(full_path):
                        media_files.append(full_path)

        # Work out the output path
        if not output_path:
            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
            if include_media:
                output_path = f'articles_export_{timestamp}.zip'
            else:
                output_path = f'articles_export_{timestamp}.{format_type}'

        # Run the export
        if include_media:
            self.export_with_media(articles_data, media_files, output_path, format_type)
        else:
            if format_type == 'json':
                self.export_as_json(articles_data, output_path)
            elif format_type == 'csv':
                self.export_as_csv(articles_data, output_path)
            # Word export support
            elif format_type == 'docx':
                self.export_as_word(articles_data, output_path)
            else:
                self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json、csv 或 docx'))
                return

        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))

    def export_as_json(self, articles_data, output_path):
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, ensure_ascii=False, indent=2)

    def export_as_csv(self, articles_data, output_path):
        if not articles_data:
            return

        # Open the CSV file
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for article_data in articles_data:
                # Join the list into a string so it can be stored in the CSV
                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
                    'media_files'] else ''
                writer.writerow(article_data)

    # Word export
    def export_as_word(self, articles_data, output_path):
        try:
            from docx import Document
            from docx.shared import Inches
        except ImportError:
            self.stdout.write(self.style.ERROR('缺少python-docx库请安装: pip install python-docx'))
            return

        # Create the Word document
        doc = Document()
        doc.add_heading('文章导出', 0)

        for article_data in articles_data:
            # Article title
            doc.add_heading(article_data['title'], level=1)

            # Article metadata
            doc.add_paragraph(f"网站: {article_data['website']}")
            doc.add_paragraph(f"URL: {article_data['url']}")
            doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
            doc.add_paragraph(f"创建时间: {article_data['created_at']}")

            # Article body
            doc.add_heading('内容', level=2)
            # Light HTML handling: strip tags
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article_data['content'], 'html.parser')

            # Embed images found in the content
            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    # Try to add the image to the document
                    try:
                        import os
                        from django.conf import settings
                        import requests
                        from io import BytesIO

                        # Build the full image path
                        if src.startswith('http'):
                            # Remote image
                            response = requests.get(src, timeout=10)
                            image_stream = BytesIO(response.content)
                            doc.add_picture(image_stream, width=Inches(4.0))
                        else:
                            # Local image
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception as e:
                        # If embedding fails, keep the image URL as text
                        doc.add_paragraph(f"[图片: {src}]")
                # Drop the original img tag
                img.decompose()

            content_text = soup.get_text()
            doc.add_paragraph(content_text)

            # Media file information
            if article_data['media_files']:
                doc.add_heading('媒体文件', level=2)
                for media_file in article_data['media_files']:
                    try:
                        import os
                        from django.conf import settings
                        from io import BytesIO
                        import requests

                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            # Embed the picture
                            doc.add_picture(full_path, width=Inches(4.0))
                        else:
                            # Media file given as a URL
                            if media_file.startswith('http'):
                                response = requests.get(media_file, timeout=10)
                                image_stream = BytesIO(response.content)
                                doc.add_picture(image_stream, width=Inches(4.0))
                            else:
                                doc.add_paragraph(media_file)
                    except Exception as e:
                        doc.add_paragraph(media_file)

            # Page break between articles
            doc.add_page_break()

        # Save the document
        doc.save(output_path)

    def export_with_media(self, articles_data, media_files, output_path, format_type):
        # Create the ZIP archive
        with zipfile.ZipFile(output_path, 'w') as zipf:
            # Add the article data file
            data_filename = f'articles.{format_type}'
            if format_type == 'json':
                json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
                zipf.writestr(data_filename, json_data)
            elif format_type == 'csv':
                # Build the CSV content in memory
                if articles_data:
                    import io
                    csv_buffer = io.StringIO()
                    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
                    writer.writeheader()
                    for article_data in articles_data:
                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
                            'media_files'] else ''
                        writer.writerow(article_data)
                    zipf.writestr(data_filename, csv_buffer.getvalue())
            # Word format support
            elif format_type == 'docx':
                # Build the Word document and store it inside the ZIP
                try:
                    from docx import Document
                    from docx.shared import Inches
                    from io import BytesIO

                    doc = Document()
                    doc.add_heading('文章导出', 0)

                    for article_data in articles_data:
                        doc.add_heading(article_data['title'], level=1)
                        doc.add_paragraph(f"网站: {article_data['website']}")
                        doc.add_paragraph(f"URL: {article_data['url']}")
                        doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
                        doc.add_paragraph(f"创建时间: {article_data['created_at']}")

                        doc.add_heading('内容', level=2)
                        from bs4 import BeautifulSoup
                        soup = BeautifulSoup(article_data['content'], 'html.parser')

                        # Embed images found in the content
                        for img in soup.find_all('img'):
                            src = img.get('src', '')
                            if src:
                                # Try to add the image to the document
                                try:
                                    import os
                                    from django.conf import settings
                                    import requests

                                    # Build the full image path
                                    if src.startswith('http'):
                                        # Remote image
                                        response = requests.get(src, timeout=10)
                                        image_stream = BytesIO(response.content)
                                        doc.add_picture(image_stream, width=Inches(4.0))
                                    else:
                                        # Local image
                                        full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                                        if os.path.exists(full_path):
                                            doc.add_picture(full_path, width=Inches(4.0))
                                except Exception as e:
                                    # If embedding fails, keep the image URL as text
                                    doc.add_paragraph(f"[图片: {src}]")
                            # Drop the original img tag
                            img.decompose()

                        content_text = soup.get_text()
                        doc.add_paragraph(content_text)

                        if article_data['media_files']:
                            doc.add_heading('媒体文件', level=2)
                            for media_file in article_data['media_files']:
                                try:
                                    import os
                                    from django.conf import settings

                                    full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                    if os.path.exists(full_path):
                                        # Embed the picture
                                        doc.add_picture(full_path, width=Inches(4.0))
                                    else:
                                        # Media file given as a URL
                                        if media_file.startswith('http'):
                                            response = requests.get(media_file, timeout=10)
                                            image_stream = BytesIO(response.content)
                                            doc.add_picture(image_stream, width=Inches(4.0))
                                        else:
                                            doc.add_paragraph(media_file)
                                except Exception as e:
                                    doc.add_paragraph(media_file)

                        doc.add_page_break()

                    # Save the document to memory, then write it into the ZIP
                    doc_buffer = BytesIO()
                    doc.save(doc_buffer)
                    doc_buffer.seek(0)
                    zipf.writestr(data_filename, doc_buffer.read())
                except ImportError:
                    zipf.writestr(data_filename, "错误缺少python-docx库无法生成Word文档")

            # Add the media files
            for media_path in media_files:
                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
                zipf.write(media_path, arcname)
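
The command's filename (and therefore its name under `python manage.py`) is not shown in this diff, so the name below is an assumption. A minimal sketch of calling it from code with the options it defines:

# Hypothetical command name "export_articles"; only the option names come from the diff.
from django.core.management import call_command

# JSON export of a single site's articles
call_command("export_articles", format="json", website="东方烟草报")

# ZIP export that bundles the article data together with the downloaded media files
call_command("export_articles", format="csv", include_media=True, output="backup.zip")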

View File

@@ -1,5 +1,6 @@
from django.db import models

class Website(models.Model):
    name = models.CharField(max_length=100, unique=True)
    base_url = models.URLField()

View File

@@ -1,17 +1,85 @@
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8"/>
    <title>{{ article.title }}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px; /* Changed: keep the page max width in sync with the list page */
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
.article-container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
padding: 30px;
margin-bottom: 20px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.meta {
color: #7f8c8d;
font-size: 0.9em;
margin-bottom: 20px;
}
hr {
border: 0;
height: 1px;
background: #ecf0f1;
margin: 20px 0;
}
.content {
font-size: 16px;
}
.content img {
max-width: 100%;
height: auto;
border-radius: 4px;
margin: 10px 0;
}
.back-link {
display: inline-block;
padding: 10px 20px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
transition: background-color 0.3s;
}
.back-link:hover {
background-color: #2980b9;
}
</style>
</head>
<body>
<div class="article-container">
    <h1>{{ article.title }}</h1>
    <div class="meta">
        <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
    </div>
    <hr/>
    <div class="content">
        {{ article.content|safe }}
    </div>
    <hr/>
    <p><a href="{% url 'article_list' %}" class="back-link">返回列表</a></p>
</div>
</body>
</html>

View File

@@ -1,33 +1,252 @@
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8"/>
    <title>绿色课堂文章列表</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px; /* Changed: increase the page max width */
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
.container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
padding: 30px;
margin-bottom: 20px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.filters {
margin-bottom: 20px;
padding: 15px;
background-color: #f1f8ff;
border-radius: 5px;
}
.filters a {
display: inline-block;
padding: 5px 10px;
margin: 0 5px 5px 0;
background-color: #e1e8ed;
color: #333;
text-decoration: none;
border-radius: 3px;
}
.filters a.active {
background-color: #3498db;
color: white;
}
ul {
list-style: none;
padding: 0;
}
li {
padding: 10px 0;
border-bottom: 1px solid #ecf0f1;
}
li:last-child {
border-bottom: none;
}
a {
color: #3498db;
text-decoration: none;
}
a:hover {
color: #2980b9;
text-decoration: underline;
}
.meta {
color: #7f8c8d;
font-size: 0.9em;
}
.pagination {
margin-top: 30px;
text-align: center;
padding: 20px 0;
}
.pagination a {
display: inline-block;
padding: 8px 16px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
margin: 0 2px; /* Changed: adjust the spacing between page numbers */
}
.pagination a:hover {
background-color: #2980b9;
}
.pagination span {
margin: 0 10px;
color: #7f8c8d;
}
/* New: current page number style */
.pagination .current {
background-color: #2980b9;
cursor: default;
}
/* New: ellipsis style */
.pagination .ellipsis {
display: inline-block;
padding: 8px 4px;
color: #7f8c8d;
}
/* New: search box style */
.search-form {
margin-bottom: 20px;
padding: 15px;
background-color: #f1f8ff;
border-radius: 5px;
}
.search-form input[type="text"] {
padding: 8px 12px;
border: 1px solid #ddd;
border-radius: 4px;
width: 300px;
margin-right: 10px;
}
.search-form input[type="submit"] {
padding: 8px 16px;
background-color: #3498db;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
}
.search-form input[type="submit"]:hover {
background-color: #2980b9;
}
.search-info {
color: #7f8c8d;
font-size: 0.9em;
margin-bottom: 10px;
}
</style>
</head>
<body>
<div class="container">
    <h1>绿色课堂文章列表</h1>
<!-- New: link back to the home page -->
<div style="margin-bottom: 20px;">
<a href="{% url 'article_list' %}" style="color: #3498db; text-decoration: none;">&larr; 返回首页</a>
</div>
<!-- New: search form -->
<div class="search-form">
<form method="get">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
{% if selected_website %}
<input type="hidden" name="website" value="{{ selected_website.id }}">
{% endif %}
<input type="submit" value="搜索">
</form>
</div>
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
<!-- New: search result summary -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
    <ul>
        {% for article in page_obj %}
            <li>
                <a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
                <div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
            </li>
        {% empty %}
            <li>暂无文章</li>
        {% endfor %}
    </ul>

    <div class="pagination">
        {% if page_obj.has_previous %}
            {% if selected_website %}
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; 首页</a>
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
            {% else %}
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
            {% endif %}
        {% endif %}

        <span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>

        <!-- Changed: improved page-number display logic -->
        {% with page_obj.paginator as paginator %}
            {% for num in paginator.page_range %}
                {% if page_obj.number == num %}
                    <a href="#" class="current">{{ num }}</a>
                {% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
                    {% if selected_website %}
                        <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
                    {% else %}
                        <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
                    {% endif %}
                {% elif num == 1 or num == paginator.num_pages %}
                    {% if selected_website %}
                        <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
                    {% else %}
                        <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
                    {% endif %}
                {% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
                    <span class="ellipsis">...</span>
                {% endif %}
            {% endfor %}
        {% endwith %}

        {% if page_obj.has_next %}
            {% if selected_website %}
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
            {% else %}
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
            {% endif %}
        {% endif %}
    </div>
</div>
</body>
</html>

View File

@@ -1,20 +1,50 @@
# core/utils.py
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque

from django.utils import timezone
from django.conf import settings
from core.models import Article
import re

def download_media(url, save_dir):
    try:
        # Send request headers to avoid 403 Forbidden responses
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": urljoin(url, "/")
        }
        resp = requests.get(url, timeout=15, headers=headers)
        resp.raise_for_status()
    except Exception as e:
        print(f"下载失败:{url},错误:{e}")
        return None

    # Handle the filename more safely: strip query parameters and special characters
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    if not filename or '.' not in filename:
        # Fall back to a default name when the URL path has no usable filename
        filename = 'media_file'

    # Strip special characters from the filename
    filename = re.sub(r'[^\w\-_\.]', '_', filename)

    # Make sure the file has an extension
    if '.' not in filename:
        content_type = resp.headers.get('content-type', '')
        if 'image/jpeg' in content_type:
            filename += '.jpg'
        elif 'image/png' in content_type:
            filename += '.png'
        elif 'image/gif' in content_type:
            filename += '.gif'
        else:
            filename += '.bin'  # default binary extension

    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)
@@ -27,22 +57,51 @@ def download_media(url, save_dir):
    with open(filepath, "wb") as f:
        f.write(resp.content)

    # Return a relative path, which is easier to store in the DB and to display
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")

def process_article(url, website):
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Handle each site's article structure
    if website.name == "www.news.cn":
        title_tag = soup.find("span", class_="title")
        content_tag = soup.find("span", id="detailContent")
    elif website.name == "东方烟草报":
        # Title extraction for 东方烟草报: try several selectors in priority order
        title_tag = (
            soup.find("h1", id="title") or       # h1 tags carrying id="title"
            soup.find("h1") or                    # main heading tag
            soup.find("title") or                 # page <title> tag
            soup.find("div", class_="title") or   # some pages use div.title
            soup.find("h2")                       # fallback heading tag
        )
        content_tag = soup.find("div", class_="content")  # article body is usually in div.content
        # Support an alternative content structure
        if not content_tag:
            content_tag = soup.find("div", id="gallery")
        # Support yet another content structure
        if not content_tag:
            content_tag = soup.find("div", id="ContentText")
    else:
        # Default handling
        title_tag = soup.find("h1") or soup.find("title")
        content_tag = soup.find("div", class_="content") or soup.find("div", id="content")

    title = title_tag.get_text(strip=True) if title_tag else "无标题"
    # Extra cleanup on the title to remove stray whitespace
    title = title.strip() if title else "无标题"

    if not content_tag:
        print("没有找到正文,跳过:", url)
        return

    imgs = content_tag.find_all("img")
@@ -56,22 +115,16 @@ def crawl_xinhua_article(url, website):
        src = img.get("src")
        if not src:
            continue

        # Use the article URL as the base when joining relative paths, to avoid broken links
        if not src.startswith("http"):
            src = urljoin(url, src)

        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

    content_html = str(content_tag)

    article = Article.objects.create(
        website=website,
        title=title,
@@ -82,22 +135,74 @@ def crawl_xinhua_article(url, website):
    )
    print(f"已保存文章及图片:{title}")

def is_valid_url(url, base_netloc):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        if parsed.netloc != base_netloc:
            return False
        return True
    except Exception:
        return False

def full_site_crawler(start_url, website, max_pages=1000):
    headers = {"User-Agent": "Mozilla/5.0"}
    visited = set()
    queue = deque([start_url])

    base_netloc = urlparse(start_url).netloc

    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue

        print(f"正在爬取:{url}")
        visited.add(url)

        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"请求失败:{url},错误:{e}")
            continue

        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # Decide, per site, whether this page is an article page
        is_article_page = False
        if website.name == "www.news.cn":
            is_article_page = soup.find("span", id="detailContent") is not None
        elif website.name == "东方烟草报":
            # For 东方烟草报, also use URL patterns:
            # article URLs usually contain /content/ plus a date-like segment
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                soup.find("div", class_="content") is not None or
                soup.find("div", id="gallery") is not None or
                soup.find("div", id="ContentText") is not None or
                ("/content/" in path and len(path) > 20)
            )
        else:
            # Default detection logic
            is_article_page = (
                soup.find("div", class_="content") is not None or
                soup.find("div", id="content") is not None
            )

        # If it is an article page, hand it off to the article processor
        if is_article_page:
            process_article(url, website)
            pages_crawled += 1

        # Grow the queue with newly discovered links
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
            if href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)

View File

@@ -1,28 +1,44 @@
from django.shortcuts import render
from django.core.paginator import Paginator
from .models import Article, Website

def article_list(request):
    # All enabled websites, for the filter bar
    websites = Website.objects.filter(enabled=True)

    # Website filter
    selected_website = None
    articles = Article.objects.all()
    website_id = request.GET.get('website')
    if website_id:
        try:
            selected_website = Website.objects.get(id=website_id)
            articles = articles.filter(website=selected_website)
        except Website.DoesNotExist:
            pass

    # New: keyword search
    search_query = request.GET.get('q')
    if search_query:
        articles = articles.filter(title__icontains=search_query)

    # Newest first
    articles = articles.order_by('-created_at')

    # Pagination
    paginator = Paginator(articles, 10)  # 10 articles per page
    page_number = request.GET.get('page')
    page_obj = paginator.get_page(page_number)

    return render(request, 'core/article_list.html', {
        'page_obj': page_obj,
        'websites': websites,
        'selected_website': selected_website,
        # New: pass the search keyword to the template
        'search_query': search_query
    })

def article_detail(request, article_id):
    article = Article.objects.get(id=article_id)
    return render(request, 'core/article_detail.html', {'article': article})

View File

@@ -1,13 +1,18 @@
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path, include
# Import the custom admin site instances
from core.admin import news_cn_admin, dongfangyancao_admin

urlpatterns = [
    path('admin/', admin.site.urls),
    path('news_cn_admin/', news_cn_admin.urls),
    path('dongfangyancao_admin/', dongfangyancao_admin.urls),
    # Front-end routes go through the core app's urls
    path('', include('core.urls')),
]

if settings.DEBUG:
    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

requirements.txt (new file, 31 lines)
View File

@@ -0,0 +1,31 @@
asgiref==3.9.1
asttokens==3.0.0
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.8.3
charset-normalizer==3.4.3
decorator==5.2.1
Django==5.1
executing==2.2.0
idna==3.10
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
lxml==6.0.0
matplotlib-inline==0.1.7
parso==0.8.4
pexpect==4.9.0
prompt_toolkit==3.0.51
ptyprocess==0.7.0
pure_eval==0.2.3
Pygments==2.19.2
python-docx==1.2.0
requests==2.32.4
soupsieve==2.7
sqlparse==0.5.3
stack-data==0.6.3
traitlets==5.14.3
typing_extensions==4.14.1
urllib3==2.5.0
uv==0.8.8
wcwidth==0.2.13