Add Search button
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
# core/management/commands/crawl_dongfangyancao.py
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
@@ -18,4 +17,4 @@ class Command(BaseCommand):
|
||||
start_url = "https://www.eastobacco.com/"
|
||||
self.stdout.write(f"开始全站爬取: {start_url}")
|
||||
full_site_crawler(start_url, website, max_pages=500)
|
||||
self.stdout.write("爬取完成")
|
||||
self.stdout.write("爬取完成")
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# core/management/commands/crawl_xinhua.py
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
@@ -23,7 +23,7 @@ class Command(BaseCommand):
|
||||
website_name = options['website']
|
||||
output_path = options['output']
|
||||
include_media = options['include_media']
|
||||
|
||||
|
||||
# 获取文章查询集
|
||||
articles = Article.objects.all()
|
||||
if website_name:
|
||||
@@ -33,15 +33,15 @@ class Command(BaseCommand):
|
||||
except Website.DoesNotExist:
|
||||
self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
|
||||
return
|
||||
|
||||
|
||||
if not articles.exists():
|
||||
self.stdout.write(self.style.WARNING('没有找到文章'))
|
||||
return
|
||||
|
||||
|
||||
# 准备导出数据
|
||||
articles_data = []
|
||||
media_files = []
|
||||
|
||||
|
||||
for article in articles:
|
||||
article_data = {
|
||||
'id': article.id,
|
||||
@@ -54,14 +54,14 @@ class Command(BaseCommand):
|
||||
'media_files': article.media_files
|
||||
}
|
||||
articles_data.append(article_data)
|
||||
|
||||
|
||||
# 收集媒体文件路径
|
||||
if include_media:
|
||||
for media_path in article.media_files:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_path)
|
||||
if os.path.exists(full_path):
|
||||
media_files.append(full_path)
|
||||
|
||||
|
||||
# 确定输出路径
|
||||
if not output_path:
|
||||
timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
|
||||
@@ -69,7 +69,7 @@ class Command(BaseCommand):
|
||||
output_path = f'articles_export_{timestamp}.zip'
|
||||
else:
|
||||
output_path = f'articles_export_{timestamp}.{format_type}'
|
||||
|
||||
|
||||
# 执行导出
|
||||
if include_media:
|
||||
self.export_with_media(articles_data, media_files, output_path, format_type)
|
||||
@@ -84,7 +84,7 @@ class Command(BaseCommand):
|
||||
else:
|
||||
self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json、csv 或 docx'))
|
||||
return
|
||||
|
||||
|
||||
self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))
|
||||
|
||||
def export_as_json(self, articles_data, output_path):
|
||||
@@ -94,16 +94,17 @@ class Command(BaseCommand):
|
||||
def export_as_csv(self, articles_data, output_path):
    """Write the exported articles to a UTF-8 CSV file.

    Args:
        articles_data: Sequence of dicts, one per article, keyed by the
            column names listed in ``fieldnames`` below. ``media_files``
            is expected to be a list of path strings.
        output_path: Destination path of the CSV file.

    Returns:
        None. When ``articles_data`` is empty nothing is written and no
        file is created.

    Raises:
        ValueError: Propagated from ``csv.DictWriter`` when a row contains
            a key that is not one of the declared columns.
    """
    if not articles_data:
        return

    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']

    # newline='' lets the csv module control line endings itself.
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for article_data in articles_data:
            # Work on a shallow copy: the caller's dict must keep its
            # media_files list. Overwriting it in place with the joined
            # string would corrupt any later export of the same data.
            row = dict(article_data)
            # A CSV cell cannot hold a list, so flatten it to a
            # ';'-separated string; a missing/None/empty value becomes ''.
            row['media_files'] = ';'.join(row.get('media_files') or [])
            writer.writerow(row)
|
||||
|
||||
# 添加Word格式导出方法
|
||||
@@ -122,19 +123,19 @@ class Command(BaseCommand):
|
||||
for article_data in articles_data:
|
||||
# 添加文章标题
|
||||
doc.add_heading(article_data['title'], level=1)
|
||||
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article_data['website']}")
|
||||
doc.add_paragraph(f"URL: {article_data['url']}")
|
||||
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
|
||||
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
|
||||
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=2)
|
||||
# 简单处理HTML内容,移除标签
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(article_data['content'], 'html.parser')
|
||||
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
@@ -145,7 +146,7 @@ class Command(BaseCommand):
|
||||
from django.conf import settings
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
@@ -160,13 +161,13 @@ class Command(BaseCommand):
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
|
||||
# 添加媒体文件信息
|
||||
if article_data['media_files']:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
@@ -176,7 +177,7 @@ class Command(BaseCommand):
|
||||
from django.conf import settings
|
||||
from io import BytesIO
|
||||
import requests
|
||||
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
@@ -191,10 +192,10 @@ class Command(BaseCommand):
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
|
||||
# 添加分页符
|
||||
doc.add_page_break()
|
||||
|
||||
|
||||
# 保存文档
|
||||
doc.save(output_path)
|
||||
|
||||
@@ -215,7 +216,8 @@ class Command(BaseCommand):
|
||||
writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
for article_data in articles_data:
|
||||
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
|
||||
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
|
||||
'media_files'] else ''
|
||||
writer.writerow(article_data)
|
||||
zipf.writestr(data_filename, csv_buffer.getvalue())
|
||||
# 添加Word格式支持
|
||||
@@ -225,7 +227,7 @@ class Command(BaseCommand):
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
doc = Document()
|
||||
doc.add_heading('文章导出', 0)
|
||||
|
||||
@@ -235,11 +237,11 @@ class Command(BaseCommand):
|
||||
doc.add_paragraph(f"URL: {article_data['url']}")
|
||||
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
|
||||
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
|
||||
|
||||
|
||||
doc.add_heading('内容', level=2)
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(article_data['content'], 'html.parser')
|
||||
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
@@ -249,7 +251,7 @@ class Command(BaseCommand):
|
||||
import os
|
||||
from django.conf import settings
|
||||
import requests
|
||||
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
@@ -264,20 +266,20 @@ class Command(BaseCommand):
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
|
||||
if article_data['media_files']:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
for media_file in article_data['media_files']:
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
@@ -292,9 +294,9 @@ class Command(BaseCommand):
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
|
||||
# 将文档保存到内存中再写入ZIP
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
@@ -302,8 +304,8 @@ class Command(BaseCommand):
|
||||
zipf.writestr(data_filename, doc_buffer.read())
|
||||
except ImportError:
|
||||
zipf.writestr(data_filename, "错误:缺少python-docx库,无法生成Word文档")
|
||||
|
||||
|
||||
# 添加媒体文件
|
||||
for media_path in media_files:
|
||||
arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
|
||||
zipf.write(media_path, arcname)
|
||||
zipf.write(media_path, arcname)
|
||||
|
||||
Reference in New Issue
Block a user