Add support for dongfangyancao (东方烟草报)

2025-08-11 22:20:19 +08:00
parent 6d80326a4e
commit d9d2ea9d99
11 changed files with 686 additions and 58 deletions

View File

@@ -1,11 +1,234 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article
# Imports used by the admin actions below
from django.contrib import messages
from django.http import HttpResponseRedirect
# Imports used by the export actions
import csv
from django.http import HttpResponse
import json
# Custom admin sites, one per source website
class NewsCnAdminSite(AdminSite):
site_header = "新华网管理后台"
site_title = "新华网管理"
index_title = "新华网内容管理"
class DongfangyancaoAdminSite(AdminSite):
site_header = "东方烟草报管理后台"
site_title = "东方烟草报管理"
index_title = "东方烟草报内容管理"
# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
list_display = ('name', 'base_url', 'enabled')
# Custom actions for ArticleAdmin
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'website', 'pub_date')
search_fields = ('title', 'content')
# Action options ('delete_selected' is Django's built-in bulk delete)
actions = ['delete_selected', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json']
def delete_dongfangyancao_articles(self, request, queryset):
"""Delete every 东方烟草报 article in one action"""
# Look up the 东方烟草报 Website record
try:
dongfangyancao_website = Website.objects.get(name='东方烟草报')
# Delete all of its articles
deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
except Website.DoesNotExist:
self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)
# Display name shown in the actions dropdown
delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields]
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in field_names]
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=articles.json'
# Build the data to export
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# Write out the JSON payload
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
# Dedicated article admin classes, one per website
class NewsCnArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'pub_date')
search_fields = ('title', 'content')
list_filter = ('pub_date',)
actions = ['export_as_csv', 'export_as_json']
def get_queryset(self, request):
qs = super().get_queryset(request)
# Only show articles from www.news.cn (Xinhua)
return qs.filter(website__name='www.news.cn')
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = []
for field in field_names:
value = getattr(obj, field)
if callable(value):
value = value()
if field == 'website':
value = value.name
row.append(value)
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
# Build the data to export
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# Write out the JSON payload
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'pub_date')
search_fields = ('title', 'content')
list_filter = ('pub_date',)
# Action options ('delete_selected' is Django's built-in bulk delete)
actions = ['delete_selected', 'delete_all_articles', 'export_as_csv', 'export_as_json']
def get_queryset(self, request):
qs = super().get_queryset(request)
# Only show articles from 东方烟草报
return qs.filter(website__name='东方烟草报')
def delete_all_articles(self, request, queryset):
"""删除当前筛选的所有文章(东方烟草报的所有文章)"""
# 删除所有东方烟草报的文章
deleted_count = self.get_queryset(request).delete()[0]
self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
# 设置动作的显示名称
delete_all_articles.short_description = "删除所有当前筛选的文章"
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields if field.name != 'content']  # exclude content to keep the CSV small
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = []
for field in field_names:
value = getattr(obj, field)
if callable(value):
value = value()
if field == 'website':
value = value.name
row.append(value)
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
# Build the data to export
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# Write out the JSON payload
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
# Register the models on their respective admin sites
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)
dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
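
Note: export_as_csv and export_as_json are repeated almost verbatim across ArticleAdmin, NewsCnArticleAdmin and DongfangyancaoArticleAdmin. One way to factor that out is a shared mixin; the sketch below is not part of this commit, the class and attribute names are illustrative, and the CSV action would follow the same pattern.

import json
from django.http import HttpResponse

class ArticleExportMixin:
    export_filename = "articles"  # subclasses override, e.g. "news_cn_articles"

    def export_as_json(self, request, queryset):
        data = [
            {
                "id": a.id,
                "title": a.title,
                "website": a.website.name,
                "url": a.url,
                "pub_date": a.pub_date.strftime("%Y-%m-%d %H:%M:%S") if a.pub_date else None,
                "content": a.content,
                "created_at": a.created_at.strftime("%Y-%m-%d %H:%M:%S"),
                "media_files": a.media_files,
            }
            for a in queryset
        ]
        response = HttpResponse(content_type="application/json")
        response["Content-Disposition"] = f"attachment; filename={self.export_filename}.json"
        response.write(json.dumps(data, ensure_ascii=False, indent=2))
        return response
    export_as_json.short_description = "导出选中文章为JSON格式"

NewsCnArticleAdmin and DongfangyancaoArticleAdmin would then only set export_filename and keep their own get_queryset.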

View File

@@ -1,20 +1,21 @@
# core/management/commands/crawl_full_site.py
# core/management/commands/crawl_dongfangyancao.py
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 www.news.cn"
help = "全站递归爬取 东方烟草报"
def handle(self, *args, **kwargs):
website, created = Website.objects.get_or_create(
name="www.news.cn",
name="东方烟草报",
defaults={
'article_list_url': 'https://www.news.cn/',
'article_list_url': 'https://www.eastobacco.com/',
'article_selector': 'a'
}
)
start_url = "https://www.news.cn/"
start_url = "https://www.eastobacco.com/"
self.stdout.write(f"开始全站爬取: {start_url}")
full_site_crawler(start_url, website, max_pages=500)
self.stdout.write("爬取完成")

View File

@@ -1,18 +1,21 @@
# core/management/commands/crawl_xinhua.py
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list
from core.utils import full_site_crawler
class Command(BaseCommand):
help = '批量爬取新华网文章'
help = "全站递归爬取 www.news.cn"
def handle(self, *args, **options):
list_url = "https://www.news.cn/legal/index.html"
try:
website = Website.objects.get(base_url="https://www.news.cn/")
except Website.DoesNotExist:
self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
return
self.stdout.write(f"开始爬取文章列表页: {list_url}")
crawl_xinhua_list(list_url, website)
self.stdout.write(self.style.SUCCESS("批量爬取完成"))
def handle(self, *args, **kwargs):
website, created = Website.objects.get_or_create(
name="www.news.cn",
defaults={
'article_list_url': 'https://www.news.cn/',
'article_selector': 'a'
}
)
start_url = "https://www.news.cn/"
self.stdout.write(f"开始全站爬取: {start_url}")
full_site_crawler(start_url, website, max_pages=500)
self.stdout.write("爬取完成")

View File

@@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list
class Command(BaseCommand):
help = '批量爬取新华网文章'
def handle(self, *args, **options):
# Usage marker to confirm whether this command is still being invoked
self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
list_url = "https://www.news.cn/legal/index.html"
try:
website = Website.objects.get(base_url="https://www.news.cn/")
except Website.DoesNotExist:
self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
return
self.stdout.write(f"开始爬取文章列表页: {list_url}")
crawl_xinhua_list(list_url, website)
self.stdout.write(self.style.SUCCESS("批量爬取完成"))
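
crawl_xinhua_list itself is not part of this diff. Based on how it is called here (one listing URL plus the Website record), it presumably fetches the listing page and hands each article link to process_article; a rough sketch under those assumptions (headers and the .html filter are guesses, not the real implementation):

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

from core.utils import process_article

def crawl_xinhua_list_sketch(list_url, website):
    # Fetch the listing page and walk its article links.
    resp = requests.get(list_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    for a in soup.select(website.article_selector or "a"):
        href = a.get("href")
        if href and href.endswith(".html"):
            process_article(urljoin(list_url, href), website)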

View File

@@ -0,0 +1,130 @@
from django.core.management.base import BaseCommand
from core.models import Article, Website
import json
import csv
import os
from django.conf import settings
from django.core.files.storage import default_storage
import zipfile
from django.utils import timezone
class Command(BaseCommand):
help = '导出文章及相关的媒体文件(图片、视频等)'
def add_arguments(self, parser):
parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
parser.add_argument('--output', type=str, default='', help='输出文件路径')
parser.add_argument('--include-media', action='store_true', help='包含媒体文件')
def handle(self, *args, **options):
format_type = options['format'].lower()
website_name = options['website']
output_path = options['output']
include_media = options['include_media']
# Build the article queryset
articles = Article.objects.all()
if website_name:
try:
website = Website.objects.get(name=website_name)
articles = articles.filter(website=website)
except Website.DoesNotExist:
self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
return
if not articles.exists():
self.stdout.write(self.style.WARNING('没有找到文章'))
return
# Prepare the export data
articles_data = []
media_files = []
for article in articles:
article_data = {
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.isoformat() if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.isoformat(),
'media_files': article.media_files
}
articles_data.append(article_data)
# Collect media file paths
if include_media:
for media_path in article.media_files:
full_path = os.path.join(settings.MEDIA_ROOT, media_path)
if os.path.exists(full_path):
media_files.append(full_path)
# Work out the output path
if not output_path:
timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
if include_media:
output_path = f'articles_export_{timestamp}.zip'
else:
output_path = f'articles_export_{timestamp}.{format_type}'
# Run the export
if include_media:
self.export_with_media(articles_data, media_files, output_path, format_type)
else:
if format_type == 'json':
self.export_as_json(articles_data, output_path)
elif format_type == 'csv':
self.export_as_csv(articles_data, output_path)
else:
self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json 或 csv'))
return
self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))
def export_as_json(self, articles_data, output_path):
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(articles_data, f, ensure_ascii=False, indent=2)
def export_as_csv(self, articles_data, output_path):
if not articles_data:
return
# Open the CSV file
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for article_data in articles_data:
# Convert the list to a string so it can be stored in a CSV cell
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
writer.writerow(article_data)
def export_with_media(self, articles_data, media_files, output_path, format_type):
# Create the ZIP archive
with zipfile.ZipFile(output_path, 'w') as zipf:
# Add the article data file
data_filename = f'articles.{format_type}'
if format_type == 'json':
json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
zipf.writestr(data_filename, json_data)
elif format_type == 'csv':
# Build the CSV content in memory
if articles_data:
import io
csv_buffer = io.StringIO()
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
writer.writeheader()
for article_data in articles_data:
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
writer.writerow(article_data)
zipf.writestr(data_filename, csv_buffer.getvalue())
# Add the media files
for media_path in media_files:
arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
zipf.write(media_path, arcname)
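
Assuming this new management command is saved as core/management/commands/export_articles.py (the filename is not visible in the diff), it can be driven from the shell ("python manage.py export_articles --format json --website 东方烟草报 --include-media") or from code; a sketch with illustrative values:

from django.core.management import call_command

# "export_articles" is the assumed command name; the output filename is illustrative.
call_command(
    "export_articles",
    format="json",
    website="东方烟草报",
    include_media=True,   # equivalent to passing --include-media
    output="dongfangyancao_export.zip",
)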

View File

@@ -1,5 +1,6 @@
from django.db import models
class Website(models.Model):
name = models.CharField(max_length=100, unique=True)
base_url = models.URLField()
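
Only part of models.py appears in this hunk. For the admin actions and the export command above to work, the Article model presumably exposes at least the fields they read; a sketch of the assumed shape (field types and options are assumptions, the real model may differ):

from django.db import models

class Article(models.Model):
    # Field names follow the code above; everything else is assumed.
    website = models.ForeignKey("Website", on_delete=models.CASCADE, related_name="articles")
    title = models.CharField(max_length=500)
    url = models.URLField(unique=True)
    content = models.TextField()
    pub_date = models.DateTimeField(null=True, blank=True)
    created_at = models.DateTimeField(auto_now_add=True)
    media_files = models.JSONField(default=list)  # list of saved media paths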

View File

@@ -1,17 +1,77 @@
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8" />
<meta charset="UTF-8"/>
<title>{{ article.title }}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
.article-container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
padding: 30px;
margin-bottom: 20px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.meta {
color: #7f8c8d;
font-size: 0.9em;
margin-bottom: 20px;
}
hr {
border: 0;
height: 1px;
background: #ecf0f1;
margin: 20px 0;
}
.content {
font-size: 16px;
}
.content img {
max-width: 100%;
height: auto;
border-radius: 4px;
margin: 10px 0;
}
.back-link {
display: inline-block;
padding: 10px 20px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
transition: background-color 0.3s;
}
.back-link:hover {
background-color: #2980b9;
}
</style>
</head>
<body>
<div class="article-container">
<h1>{{ article.title }}</h1>
<div class="meta">
<p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
<hr />
<div>
</div>
<hr/>
<div class="content">
{{ article.content|safe }}
</div>
<hr />
<p><a href="{% url 'article_list' %}">返回列表</a></p>
<hr/>
<p><a href="{% url 'article_list' %}" class="back-link">返回列表</a></p>
</div>
</body>
</html>

View File

@@ -1,17 +1,113 @@
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8" />
<meta charset="UTF-8"/>
<title>绿色课堂文章列表</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
.container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
padding: 30px;
margin-bottom: 20px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.filters {
margin-bottom: 20px;
padding: 15px;
background-color: #f1f8ff;
border-radius: 5px;
}
.filters a {
display: inline-block;
padding: 5px 10px;
margin: 0 5px 5px 0;
background-color: #e1e8ed;
color: #333;
text-decoration: none;
border-radius: 3px;
}
.filters a.active {
background-color: #3498db;
color: white;
}
ul {
list-style: none;
padding: 0;
}
li {
padding: 10px 0;
border-bottom: 1px solid #ecf0f1;
}
li:last-child {
border-bottom: none;
}
a {
color: #3498db;
text-decoration: none;
}
a:hover {
color: #2980b9;
text-decoration: underline;
}
.meta {
color: #7f8c8d;
font-size: 0.9em;
}
.pagination {
margin-top: 30px;
text-align: center;
padding: 20px 0;
}
.pagination a {
display: inline-block;
padding: 8px 16px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
margin: 0 5px;
}
.pagination a:hover {
background-color: #2980b9;
}
.pagination span {
margin: 0 10px;
color: #7f8c8d;
}
</style>
</head>
<body>
<div class="container">
<h1>绿色课堂文章列表</h1>
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}" {% if not selected_website %}class="active"{% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}" {% if selected_website.id == website.id %}class="active"{% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
<ul>
{% for article in page_obj %}
<li>
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
({{ article.created_at|date:"Y-m-d" }})
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
@@ -20,14 +116,23 @@
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}&page={{ page_obj.next_page_number }}">下一页</a>
{% else %}
<a href="?page={{ page_obj.next_page_number }}">下一页</a>
{% endif %}
{% endif %}
</div>
</div>
</body>
</html>

View File

@@ -7,16 +7,44 @@ from collections import deque
from django.utils import timezone
from django.conf import settings
from core.models import Article
import re
def download_media(url, save_dir):
try:
resp = requests.get(url, timeout=15)
# Send request headers to avoid 403 Forbidden responses
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Referer": urljoin(url, "/")
}
resp = requests.get(url, timeout=15, headers=headers)
resp.raise_for_status()
except Exception as e:
print(f"下载失败:{url},错误:{e}")
return None
filename = url.split("/")[-1].split("?")[0]
# Derive the filename more safely: drop query parameters and handle special characters
parsed_url = urlparse(url)
filename = os.path.basename(parsed_url.path)
if not filename or '.' not in filename:
# Fall back to a default name if the URL path has no usable filename
filename = 'media_file'
# Strip special characters from the filename
filename = re.sub(r'[^\w\-_\.]', '_', filename)
# Make sure the file has an extension
if '.' not in filename:
content_type = resp.headers.get('content-type', '')
if 'image/jpeg' in content_type:
filename += '.jpg'
elif 'image/png' in content_type:
filename += '.png'
elif 'image/gif' in content_type:
filename += '.gif'
else:
filename += '.bin'  # default binary extension
os.makedirs(save_dir, exist_ok=True)
filepath = os.path.join(save_dir, filename)
@@ -31,6 +59,7 @@ def download_media(url, save_dir):
f.write(resp.content)
return filepath
def process_article(url, website):
if Article.objects.filter(url=url).exists():
print(f"文章已存在,跳过: {url}")
@@ -41,10 +70,36 @@ def process_article(url, website):
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")
# Handle the article structure of each website
if website.name == "www.news.cn":
title_tag = soup.find("span", class_="title")
content_tag = soup.find("span", id="detailContent")
elif website.name == "东方烟草报":
# Title extraction for 东方烟草报: try several selectors in priority order
title_tag = (
soup.find("h1", id="title") or # h1 tags carrying id="title"
soup.find("h1") or # primary heading tag
soup.find("title") or # the page <title> tag
soup.find("div", class_="title") or # some pages use div.title
soup.find("h2") # fallback heading tag
)
content_tag = soup.find("div", class_="content") # 东方烟草报 body text usually sits in div.content
# Support an alternative content structure
if not content_tag:
content_tag = soup.find("div", id="gallery")
# Support yet another content structure
if not content_tag:
content_tag = soup.find("div", id="ContentText")
else:
# Default handling for other websites
title_tag = soup.find("h1") or soup.find("title")
content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
content_tag = soup.find("span", id="detailContent")
# Extra title cleanup: strip any stray whitespace
title = title.strip() if title else "无标题"
if not content_tag:
print("没有找到正文,跳过:", url)
return
@@ -80,6 +135,7 @@ def process_article(url, website):
)
print(f"已保存文章及图片:{title}")
def is_valid_url(url, base_netloc):
try:
parsed = urlparse(url)
@@ -91,6 +147,7 @@ def is_valid_url(url, base_netloc):
except Exception:
return False
def full_site_crawler(start_url, website, max_pages=1000):
headers = {"User-Agent": "Mozilla/5.0"}
visited = set()
@@ -117,8 +174,30 @@ def full_site_crawler(start_url, website, max_pages=1000):
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")
# Decide per website whether this page is an article page
is_article_page = False
if website.name == "www.news.cn":
is_article_page = soup.find("span", id="detailContent") is not None
elif website.name == "东方烟草报":
# For 东方烟草报, also use the URL pattern as a signal:
# its article URLs usually contain /content/ plus a date-like path
parsed_url = urlparse(url)
path = parsed_url.path
is_article_page = (
soup.find("div", class_="content") is not None or
soup.find("div", id="gallery") is not None or
soup.find("div", id="ContentText") is not None or
("/content/" in path and len(path) > 20)
)
else:
# Default detection logic
is_article_page = (
soup.find("div", class_="content") is not None or
soup.find("div", id="content") is not None
)
# If this is an article page, hand it to the article processor
if soup.find("span", id="detailContent"):
if is_article_page:
process_article(url, website)
pages_crawled += 1
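
The hunk above starts in the middle of full_site_crawler, so the queue handling around it is not shown. A minimal sketch of the surrounding breadth-first loop, consistent with the visible pieces (headers, visited, pages_crawled, is_valid_url) but otherwise assumed:

from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from core.utils import is_valid_url, process_article

def full_site_crawler_sketch(start_url, website, max_pages=1000):
    headers = {"User-Agent": "Mozilla/5.0"}
    base_netloc = urlparse(start_url).netloc
    visited = set()
    queue = deque([start_url])
    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as exc:
            print(f"抓取失败:{url},错误:{exc}")
            continue
        resp.encoding = "utf-8"
        soup = BeautifulSoup(resp.text, "html.parser")

        # Per-site article detection, simplified here; the hunk above shows the
        # full checks (span#detailContent, div.content, div#gallery, div#ContentText, URL pattern).
        if soup.find("span", id="detailContent") or soup.find("div", class_="content"):
            process_article(url, website)
            pages_crawled += 1

        # Enqueue same-domain links for later rounds.
        for a in soup.find_all("a", href=True):
            next_url = urljoin(url, a["href"])
            if is_valid_url(next_url, base_netloc) and next_url not in visited:
                queue.append(next_url)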

View File

@@ -2,6 +2,7 @@ from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
from .models import Article
def article_list(request):
"""
显示文章列表的视图函数
@@ -16,6 +17,7 @@ def article_list(request):
'page_obj': page_obj
})
def article_detail(request, article_id):
"""
显示文章详情的视图函数
@@ -24,5 +26,3 @@ def article_detail(request, article_id):
return render(request, 'core/article_detail.html', {
'article': article
})
# Create your views here.
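
The views.py hunks above only show context lines, but the updated article_list.html references websites and selected_website, so article_list presumably filters by a ?website=<id> query parameter and puts both into the template context. A sketch of what that likely looks like (assumed; the page size and ordering are illustrative):

from django.core.paginator import Paginator
from django.shortcuts import render

from .models import Article, Website

def article_list(request):
    websites = Website.objects.all()
    selected_website = None

    articles = Article.objects.order_by("-created_at")
    website_id = request.GET.get("website")
    if website_id and website_id.isdigit():
        selected_website = Website.objects.filter(id=website_id).first()
        if selected_website:
            articles = articles.filter(website=selected_website)

    page_obj = Paginator(articles, 20).get_page(request.GET.get("page"))
    return render(request, "core/article_list.html", {
        "page_obj": page_obj,
        "websites": websites,
        "selected_website": selected_website,
    })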

View File

@@ -1,10 +1,15 @@
from django.contrib import admin
from django.urls import path, include
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path, include
# Import the custom admin site instances
from core.admin import news_cn_admin, dongfangyancao_admin
urlpatterns = [
path('admin/', admin.site.urls),
path('news_cn_admin/', news_cn_admin.urls),
path('dongfangyancao_admin/', dongfangyancao_admin.urls),
# Public-facing routes are delegated to the core app's urls
path('', include('core.urls')),
]