diff --git a/core/admin.py b/core/admin.py
index aa37419..39e48b0 100644
--- a/core/admin.py
+++ b/core/admin.py
@@ -1,11 +1,234 @@
 from django.contrib import admin
+from django.contrib.admin import AdminSite
 from .models import Website, Article
+# Imports used by the admin actions
+from django.contrib import messages
+from django.http import HttpResponseRedirect
+# Imports used by the export actions
+import csv
+from django.http import HttpResponse
+import json
+
+# Custom admin sites, one per source website
+class NewsCnAdminSite(AdminSite):
+    site_header = "新华网管理后台"
+    site_title = "新华网管理"
+    index_title = "新华网内容管理"
+
+class DongfangyancaoAdminSite(AdminSite):
+    site_header = "东方烟草报管理后台"
+    site_title = "东方烟草报管理"
+    index_title = "东方烟草报内容管理"
+
+# Instantiate the custom sites
+news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
+dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
 
 @admin.register(Website)
 class WebsiteAdmin(admin.ModelAdmin):
     list_display = ('name', 'base_url', 'enabled')
 
+# Custom bulk actions for ArticleAdmin
 @admin.register(Article)
 class ArticleAdmin(admin.ModelAdmin):
     list_display = ('title', 'website', 'pub_date')
     search_fields = ('title', 'content')
+    # Bulk actions; 'delete_selected' is Django's built-in delete action
+    actions = ['delete_selected', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json']
+
+    def delete_dongfangyancao_articles(self, request, queryset):
+        """Delete every 东方烟草报 article in one step."""
+        try:
+            dongfangyancao_website = Website.objects.get(name='东方烟草报')
+            # Deletes all articles for that site, regardless of the current selection
+            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
+            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
+        except Website.DoesNotExist:
+            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)
+
+    # Display name shown in the actions dropdown
+    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
+
+    def export_as_csv(self, request, queryset):
+        """Export the selected articles as CSV."""
+        meta = self.model._meta
+        field_names = [field.name for field in meta.fields]
+
+        response = HttpResponse(content_type='text/csv')
+        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
+        writer = csv.writer(response)
+
+        writer.writerow(field_names)
+        for obj in queryset:
+            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in field_names]
+            writer.writerow(row)
+
+        return response
+
+    export_as_csv.short_description = "导出选中文章为CSV格式"
+
+    def export_as_json(self, request, queryset):
+        """Export the selected articles as JSON."""
+        response = HttpResponse(content_type='application/json')
+        response['Content-Disposition'] = 'attachment; filename=articles.json'
+
+        # Build the export payload
+        articles_data = []
+        for article in queryset:
+            articles_data.append({
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
+                'media_files': article.media_files
+            })
+
+        # Write the JSON body
+        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
+        return response
+
+    export_as_json.short_description = "导出选中文章为JSON格式"
+
+# Dedicated article admins for each site
+class NewsCnArticleAdmin(admin.ModelAdmin):
+    list_display = ('title', 'pub_date')
+    search_fields = ('title', 'content')
+    list_filter = ('pub_date',)
+    actions = ['export_as_csv', 'export_as_json']
+
+    def get_queryset(self, request):
+        qs = super().get_queryset(request)
+        # Show only www.news.cn articles
+        return qs.filter(website__name='www.news.cn')
+
+    def export_as_csv(self, request, queryset):
+        """Export the selected articles as CSV."""
+        meta = self.model._meta
+        field_names = [field.name for field in meta.fields if field.name != 'content']  # omit content to keep the CSV small
+
+        response = HttpResponse(content_type='text/csv')
+        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
+        writer = csv.writer(response)
+
+        writer.writerow(field_names)
+        for obj in queryset:
+            row = []
+            for field in field_names:
+                value = getattr(obj, field)
+                if callable(value):
+                    value = value()
+                if field == 'website':
+                    value = value.name
+                row.append(value)
+            writer.writerow(row)
+
+        return response
+
+    export_as_csv.short_description = "导出选中文章为CSV格式"
+
+    def export_as_json(self, request, queryset):
+        """Export the selected articles as JSON."""
+        response = HttpResponse(content_type='application/json')
+        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
+
+        # Build the export payload
+        articles_data = []
+        for article in queryset:
+            articles_data.append({
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
+                'media_files': article.media_files
+            })
+
+        # Write the JSON body
+        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
+        return response
+
+    export_as_json.short_description = "导出选中文章为JSON格式"
+
+class DongfangyancaoArticleAdmin(admin.ModelAdmin):
+    list_display = ('title', 'pub_date')
+    search_fields = ('title', 'content')
+    list_filter = ('pub_date',)
+    # Bulk actions; 'delete_selected' is Django's built-in delete action
+    actions = ['delete_selected', 'delete_all_articles', 'export_as_csv', 'export_as_json']
+
+    def get_queryset(self, request):
+        qs = super().get_queryset(request)
+        # Show only 东方烟草报 articles
+        return qs.filter(website__name='东方烟草报')
+
+    def delete_all_articles(self, request, queryset):
+        """Delete everything in the current filter (all 东方烟草报 articles)."""
+        deleted_count = self.get_queryset(request).delete()[0]
+        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
+
+    # Display name shown in the actions dropdown
+    delete_all_articles.short_description = "删除所有当前筛选的文章"
+
+    def export_as_csv(self, request, queryset):
+        """Export the selected articles as CSV."""
+        meta = self.model._meta
+        field_names = [field.name for field in meta.fields if field.name != 'content']  # omit content to keep the CSV small
+
+        response = HttpResponse(content_type='text/csv')
+        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
+        writer = csv.writer(response)
+
+        writer.writerow(field_names)
+        for obj in queryset:
+            row = []
+            for field in field_names:
+                value = getattr(obj, field)
+                if callable(value):
+                    value = value()
+                if field == 'website':
+                    value = value.name
+                row.append(value)
+            writer.writerow(row)
+
+        return response
+
+    export_as_csv.short_description = "导出选中文章为CSV格式"
+
+    def export_as_json(self, request, queryset):
+        """Export the selected articles as JSON."""
+        response = HttpResponse(content_type='application/json')
+        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
+
+        # Build the export payload
+        articles_data = []
+        for article in queryset:
+            articles_data.append({
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
+                'media_files': article.media_files
+            })
+
+        # Write the JSON body
+        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
+        return response
+
+    export_as_json.short_description = "导出选中文章为JSON格式"
+
+# Register the models on their respective sites
+news_cn_admin.register(Website, WebsiteAdmin)
+news_cn_admin.register(Article, NewsCnArticleAdmin)
+
+dongfangyancao_admin.register(Website, WebsiteAdmin)
+dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
\ No newline at end of file
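Instantiating NewsCnAdminSite and DongfangyancaoAdminSite is not enough on its own; each site's URLs also have to be mounted in the project URLconf, which this diff does not touch. A minimal sketch, assuming a project-level urls.py and freely chosen path prefixes (both are assumptions, not part of this change):

    # Hypothetical project urls.py; the module location and prefixes are assumed.
    from django.contrib import admin
    from django.urls import path

    from core.admin import dongfangyancao_admin, news_cn_admin

    urlpatterns = [
        path('admin/', admin.site.urls),                           # default admin
        path('news-cn-admin/', news_cn_admin.urls),                # 新华网 site
        path('dongfangyancao-admin/', dongfangyancao_admin.urls),  # 东方烟草报 site
    ]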
diff --git a/core/management/commands/crawl_full_site.py b/core/management/commands/crawl_dongfangyancao.py
similarity index 66%
rename from core/management/commands/crawl_full_site.py
rename to core/management/commands/crawl_dongfangyancao.py
index 1c25982..2cb43e9 100644
--- a/core/management/commands/crawl_full_site.py
+++ b/core/management/commands/crawl_dongfangyancao.py
@@ -1,20 +1,21 @@
-# core/management/commands/crawl_full_site.py
+# core/management/commands/crawl_dongfangyancao.py
 from django.core.management.base import BaseCommand
 from core.models import Website
 from core.utils import full_site_crawler
 
+
 class Command(BaseCommand):
-    help = "全站递归爬取 www.news.cn"
+    help = "全站递归爬取 东方烟草报"
 
     def handle(self, *args, **kwargs):
         website, created = Website.objects.get_or_create(
-            name="www.news.cn",
+            name="东方烟草报",
             defaults={
-                'article_list_url': 'https://www.news.cn/',
+                'article_list_url': 'https://www.eastobacco.com/',
                 'article_selector': 'a'
             }
         )
-        start_url = "https://www.news.cn/"
+        start_url = "https://www.eastobacco.com/"
         self.stdout.write(f"开始全站爬取: {start_url}")
         full_site_crawler(start_url, website, max_pages=500)
         self.stdout.write("爬取完成")
\ No newline at end of file
diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index 559f77f..4dc2d9b 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -1,18 +1,21 @@
+# core/management/commands/crawl_xinhua.py
 from django.core.management.base import BaseCommand
 from core.models import Website
-from core.utils import crawl_xinhua_list
+from core.utils import full_site_crawler
 
+
 class Command(BaseCommand):
-    help = '批量爬取新华网文章'
+    help = "全站递归爬取 www.news.cn"
 
-    def handle(self, *args, **options):
-        list_url = "https://www.news.cn/legal/index.html"
-        try:
-            website = Website.objects.get(base_url="https://www.news.cn/")
-        except Website.DoesNotExist:
-            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
-            return
-
-        self.stdout.write(f"开始爬取文章列表页: {list_url}")
-        crawl_xinhua_list(list_url, website)
-        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
+    def handle(self, *args, **kwargs):
+        website, created = Website.objects.get_or_create(
+            name="www.news.cn",
+            defaults={
+                'article_list_url': 'https://www.news.cn/',
+                'article_selector': 'a'
+            }
+        )
+        start_url = "https://www.news.cn/"
+        self.stdout.write(f"开始全站爬取: {start_url}")
+        full_site_crawler(start_url, website, max_pages=500)
+        self.stdout.write("爬取完成")
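With the rename in place, each site has its own crawl command, so a typical run is standard manage.py usage:

    python manage.py crawl_dongfangyancao
    python manage.py crawl_xinhua

Both commands share full_site_crawler with max_pages=500, so a single run is capped at 500 pages per site.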
diff --git a/core/management/commands/crawl_xinhua_bak.py b/core/management/commands/crawl_xinhua_bak.py
new file mode 100644
index 0000000..99aaa98
--- /dev/null
+++ b/core/management/commands/crawl_xinhua_bak.py
@@ -0,0 +1,21 @@
+from django.core.management.base import BaseCommand
+from core.models import Website
+from core.utils import crawl_xinhua_list
+
+class Command(BaseCommand):
+    help = '批量爬取新华网文章'
+
+    def handle(self, *args, **options):
+        # Usage marker to confirm whether this legacy command still gets called
+        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
+
+        list_url = "https://www.news.cn/legal/index.html"
+        try:
+            website = Website.objects.get(base_url="https://www.news.cn/")
+        except Website.DoesNotExist:
+            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
+            return
+
+        self.stdout.write(f"开始爬取文章列表页: {list_url}")
+        crawl_xinhua_list(list_url, website)
+        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
\ No newline at end of file
diff --git a/core/management/commands/export_articles.py b/core/management/commands/export_articles.py
new file mode 100644
index 0000000..5623b78
--- /dev/null
+++ b/core/management/commands/export_articles.py
@@ -0,0 +1,130 @@
+from django.core.management.base import BaseCommand
+from core.models import Article, Website
+import json
+import csv
+import os
+from django.conf import settings
+from django.core.files.storage import default_storage
+import zipfile
+from django.utils import timezone
+
+
+class Command(BaseCommand):
+    help = '导出文章及相关的媒体文件(图片、视频等)'
+
+    def add_arguments(self, parser):
+        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
+        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
+        parser.add_argument('--output', type=str, default='', help='输出文件路径')
+        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')
+
+    def handle(self, *args, **options):
+        format_type = options['format'].lower()
+        website_name = options['website']
+        output_path = options['output']
+        include_media = options['include_media']
+
+        # Build the article queryset
+        articles = Article.objects.all()
+        if website_name:
+            try:
+                website = Website.objects.get(name=website_name)
+                articles = articles.filter(website=website)
+            except Website.DoesNotExist:
+                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
+                return
+
+        if not articles.exists():
+            self.stdout.write(self.style.WARNING('没有找到文章'))
+            return
+
+        # Assemble the export data
+        articles_data = []
+        media_files = []
+
+        for article in articles:
+            article_data = {
+                'id': article.id,
+                'title': article.title,
+                'website': article.website.name,
+                'url': article.url,
+                'pub_date': article.pub_date.isoformat() if article.pub_date else None,
+                'content': article.content,
+                'created_at': article.created_at.isoformat(),
+                'media_files': article.media_files
+            }
+            articles_data.append(article_data)
+
+            # Collect media file paths
+            if include_media:
+                for media_path in article.media_files:
+                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
+                    if os.path.exists(full_path):
+                        media_files.append(full_path)
+
+        # Derive the output path if none was given
+        if not output_path:
+            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
+            if include_media:
+                output_path = f'articles_export_{timestamp}.zip'
+            else:
+                output_path = f'articles_export_{timestamp}.{format_type}'
+
+        # Run the export
+        if include_media:
+            self.export_with_media(articles_data, media_files, output_path, format_type)
+        else:
+            if format_type == 'json':
+                self.export_as_json(articles_data, output_path)
+            elif format_type == 'csv':
+                self.export_as_csv(articles_data, output_path)
+            else:
+                self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json 或 csv'))
+                return
+
+        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))
+
+    def export_as_json(self, articles_data, output_path):
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(articles_data, f, ensure_ascii=False, indent=2)
+
+    def export_as_csv(self, articles_data, output_path):
+        if not articles_data:
+            return
+
+        # Write the CSV file
+        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
+            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+            writer.writeheader()
+            for article_data in articles_data:
+                # Flatten the list so it fits in a single CSV cell
+                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
+                writer.writerow(article_data)
+
+    def export_with_media(self, articles_data, media_files, output_path, format_type):
+        # Build the ZIP archive
+        with zipfile.ZipFile(output_path, 'w') as zipf:
+            # Add the article data file
+            data_filename = f'articles.{format_type}'
+            if format_type == 'json':
+                json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
+                zipf.writestr(data_filename, json_data)
+            elif format_type == 'csv':
+                # Render the CSV in memory
+                if articles_data:
+                    import io
+                    csv_buffer = io.StringIO()
+                    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
+                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
+                    writer.writeheader()
+                    for article_data in articles_data:
+                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
+                        writer.writerow(article_data)
+                    zipf.writestr(data_filename, csv_buffer.getvalue())
+
+            # Add the media files
+            for media_path in media_files:
+                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
+                zipf.write(media_path, arcname)
\ No newline at end of file
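Typical invocations of the new export command, using only the flags defined in add_arguments above:

    # JSON dump of every article
    python manage.py export_articles --format json

    # CSV for a single site, bundled with its media files
    python manage.py export_articles --format csv --website 东方烟草报 --include-media --output export.zip

Note that --include-media always produces a ZIP archive (the data file plus a media/ directory), regardless of --format.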
diff --git a/core/models.py b/core/models.py
index a1a1806..a176335 100644
--- a/core/models.py
+++ b/core/models.py
@@ -1,5 +1,6 @@
 from django.db import models
 
+
 class Website(models.Model):
     name = models.CharField(max_length=100, unique=True)
     base_url = models.URLField()
diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html
index ffb2649..f0aee66 100644
--- a/core/templates/core/article_detail.html
+++ b/core/templates/core/article_detail.html
@@ -1,17 +1,77 @@
+发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}
[remainder of the template hunk not recoverable from the source]
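For reference, the admin actions and the export command all read the same Article fields (id, title, website, url, pub_date, content, created_at, media_files) and iterate media_files as a list of MEDIA_ROOT-relative paths. The model definition itself is not part of this diff; the following is a hypothetical reconstruction inferred from that usage, and the real core/models.py may differ:

    # Hypothetical sketch of the Article model implied by this diff; field
    # types and options are assumptions, not the actual core/models.py code.
    from django.db import models

    class Article(models.Model):
        website = models.ForeignKey('Website', on_delete=models.CASCADE)
        title = models.CharField(max_length=300)
        url = models.URLField(unique=True)
        content = models.TextField()
        pub_date = models.DateTimeField(null=True, blank=True)  # callers guard against None
        created_at = models.DateTimeField(auto_now_add=True)
        media_files = models.JSONField(default=list)  # paths relative to MEDIA_ROOT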