Add Search button

2025-08-11 23:42:14 +08:00
parent b6bbb90703
commit 958b087f54
8 changed files with 330 additions and 187 deletions
--- a/core/management/commands/crawl_dongfangyancao.py
+++ b/core/management/commands/crawl_dongfangyancao.py
@@ -1,4 +1,3 @@
-# core/management/commands/crawl_dongfangyancao.py
 from django.core.management.base import BaseCommand
 from core.models import Website
 from core.utils import full_site_crawler
@@ -18,4 +17,4 @@ class Command(BaseCommand):
        start_url = "https://www.eastobacco.com/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
-        self.stdout.write("爬取完成")
+        self.stdout.write("爬取完成")
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -1,4 +1,3 @@
-# core/management/commands/crawl_xinhua.py
 from django.core.management.base import BaseCommand
 from core.models import Website
 from core.utils import full_site_crawler
--- a/core/management/commands/export_articles.py
+++ b/core/management/commands/export_articles.py
@@ -23,7 +23,7 @@ class Command(BaseCommand):
        website_name = options['website']
        output_path = options['output']
        include_media = options['include_media']
-        
+
        # 获取文章查询集
        articles = Article.objects.all()
        if website_name:
@@ -33,15 +33,15 @@ class Command(BaseCommand):
            except Website.DoesNotExist:
                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
                return
-        
+
        if not articles.exists():
            self.stdout.write(self.style.WARNING('没有找到文章'))
            return
-        
+
        # 准备导出数据
        articles_data = []
        media_files = []
-        
+
        for article in articles:
            article_data = {
                'id': article.id,
@@ -54,14 +54,14 @@ class Command(BaseCommand):
                'media_files': article.media_files
            }
            articles_data.append(article_data)
-            
+
            # 收集媒体文件路径
            if include_media:
                for media_path in article.media_files:
                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
                    if os.path.exists(full_path):
                        media_files.append(full_path)
-        
+
        # 确定输出路径
        if not output_path:
            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
@@ -69,7 +69,7 @@ class Command(BaseCommand):
                output_path = f'articles_export_{timestamp}.zip'
            else:
                output_path = f'articles_export_{timestamp}.{format_type}'
-        
+
        # 执行导出
        if include_media:
            self.export_with_media(articles_data, media_files, output_path, format_type)
@@ -84,7 +84,7 @@ class Command(BaseCommand):
            else:
                self.stdout.write(self.style.ERROR('不支持的格式，仅支持 json、csv 或 docx'))
                return
-        
+
        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))

    def export_as_json(self, articles_data, output_path):
@@ -94,16 +94,17 @@ class Command(BaseCommand):
    def export_as_csv(self, articles_data, output_path):
        if not articles_data:
            return
-            
+
        # 打开CSV文件
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-            
+
            writer.writeheader()
            for article_data in articles_data:
                # 将列表转换为字符串以便在CSV中存储
-                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
+                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
+                    'media_files'] else ''
                writer.writerow(article_data)

    # 添加Word格式导出方法
@@ -122,19 +123,19 @@ class Command(BaseCommand):
        for article_data in articles_data:
            # 添加文章标题
            doc.add_heading(article_data['title'], level=1)
-            
+
            # 添加文章元数据
            doc.add_paragraph(f"网站: {article_data['website']}")
            doc.add_paragraph(f"URL: {article_data['url']}")
            doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
            doc.add_paragraph(f"创建时间: {article_data['created_at']}")
-            
+
            # 添加文章内容
            doc.add_heading('内容', level=2)
            # 简单处理HTML内容，移除标签
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article_data['content'], 'html.parser')
-            
+
            # 处理内容中的图片
            for img in soup.find_all('img'):
                src = img.get('src', '')
@@ -145,7 +146,7 @@ class Command(BaseCommand):
                        from django.conf import settings
                        import requests
                        from io import BytesIO
-                        
+
                        # 构建完整的图片路径
                        if src.startswith('http'):
                            # 网络图片
@@ -160,13 +161,13 @@ class Command(BaseCommand):
                    except Exception as e:
                        # 如果添加图片失败，添加图片URL作为文本
                        doc.add_paragraph(f"[图片: {src}]")
-                
+
                # 移除原始img标签
                img.decompose()
-            
+
            content_text = soup.get_text()
            doc.add_paragraph(content_text)
-            
+
            # 添加媒体文件信息
            if article_data['media_files']:
                doc.add_heading('媒体文件', level=2)
@@ -176,7 +177,7 @@ class Command(BaseCommand):
                        from django.conf import settings
                        from io import BytesIO
                        import requests
-                        
+
                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            # 添加图片到文档
@@ -191,10 +192,10 @@ class Command(BaseCommand):
                                doc.add_paragraph(media_file)
                    except Exception as e:
                        doc.add_paragraph(media_file)
-            
+
            # 添加分页符
            doc.add_page_break()
-        
+
        # 保存文档
        doc.save(output_path)

@@ -215,7 +216,8 @@ class Command(BaseCommand):
                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
                    writer.writeheader()
                    for article_data in articles_data:
-                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
+                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
+                            'media_files'] else ''
                        writer.writerow(article_data)
                    zipf.writestr(data_filename, csv_buffer.getvalue())
            # 添加Word格式支持
@@ -225,7 +227,7 @@ class Command(BaseCommand):
                    from docx import Document
                    from docx.shared import Inches
                    from io import BytesIO
-                    
+
                    doc = Document()
                    doc.add_heading('文章导出', 0)

@@ -235,11 +237,11 @@ class Command(BaseCommand):
                        doc.add_paragraph(f"URL: {article_data['url']}")
                        doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
                        doc.add_paragraph(f"创建时间: {article_data['created_at']}")
-                        
+
                        doc.add_heading('内容', level=2)
                        from bs4 import BeautifulSoup
                        soup = BeautifulSoup(article_data['content'], 'html.parser')
-                        
+
                        # 处理内容中的图片
                        for img in soup.find_all('img'):
                            src = img.get('src', '')
@@ -249,7 +251,7 @@ class Command(BaseCommand):
                                    import os
                                    from django.conf import settings
                                    import requests
-                                    
+
                                    # 构建完整的图片路径
                                    if src.startswith('http'):
                                        # 网络图片
@@ -264,20 +266,20 @@ class Command(BaseCommand):
                                except Exception as e:
                                    # 如果添加图片失败，添加图片URL作为文本
                                    doc.add_paragraph(f"[图片: {src}]")
-                            
+
                            # 移除原始img标签
                            img.decompose()
-                        
+
                        content_text = soup.get_text()
                        doc.add_paragraph(content_text)
-                        
+
                        if article_data['media_files']:
                            doc.add_heading('媒体文件', level=2)
                            for media_file in article_data['media_files']:
                                try:
                                    import os
                                    from django.conf import settings
-                                    
+
                                    full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                    if os.path.exists(full_path):
                                        # 添加图片到文档
@@ -292,9 +294,9 @@ class Command(BaseCommand):
                                            doc.add_paragraph(media_file)
                                except Exception as e:
                                    doc.add_paragraph(media_file)
-                        
+
                        doc.add_page_break()
-                    
+
                    # 将文档保存到内存中再写入ZIP
                    doc_buffer = BytesIO()
                    doc.save(doc_buffer)
@@ -302,8 +304,8 @@ class Command(BaseCommand):
                    zipf.writestr(data_filename, doc_buffer.read())
                except ImportError:
                    zipf.writestr(data_filename, "错误：缺少python-docx库，无法生成Word文档")
-            
+
            # 添加媒体文件
            for media_path in media_files:
                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
-                zipf.write(media_path, arcname)
+                zipf.write(media_path, arcname)