xinhua_base

This commit is contained in:
2025-08-11 13:10:23 +08:00
parent b43443551f
commit 4e5e35b4fa
20 changed files with 427 additions and 1 deletions

0
core/__init__.py Normal file
View File

11
core/admin.py Normal file
View File

@@ -0,0 +1,11 @@
from django.contrib import admin
from .models import Website, Article
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    """Admin configuration for ``Website`` records."""

    # Columns shown on the admin change list.
    list_display = ["name", "base_url", "enabled"]
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    """Admin configuration for ``Article`` records."""

    # Columns shown on the admin change list.
    list_display = ["title", "website", "pub_date"]
    # Fields matched by the admin search box.
    search_fields = ["title", "content"]

6
core/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class CoreConfig(AppConfig):
    """Application configuration for the ``core`` app."""

    name = "core"
    # Primary-key type used for models that do not declare one explicitly.
    default_auto_field = "django.db.models.BigAutoField"

View File

@@ -0,0 +1,23 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_article
class Command(BaseCommand):
    """Management command that crawls a fixed list of Xinhua article URLs."""

    help = '爬取新华网文章示例'

    def handle(self, *args, **options):
        """Look up the Xinhua ``Website`` row and crawl each listed URL.

        Writes an error to stdout and returns without crawling when the
        website row has not been created yet.
        """
        # The "新华网" Website row is expected to have been created in the
        # admin beforehand.
        website_name = "新华网"
        site = Website.objects.filter(name=website_name).first()
        if site is None:
            self.stdout.write(self.style.ERROR(f"网站 '{website_name}' 不存在,请先后台创建"))
            return

        # Article URLs to crawl; add more entries to fetch several articles.
        article_urls = [
            "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
        ]
        for article_url in article_urls:
            crawl_xinhua_article(article_url, site)

View File

@@ -0,0 +1,41 @@
# Generated by Django 5.1 on 2025-08-11 04:53
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    # Initial migration: creates the Website and Article tables.
    # Auto-generated by Django; keep it in sync with the model definitions
    # rather than editing it by hand.

    initial = True

    # No dependencies: this is the first migration in its app.
    dependencies = [
    ]

    operations = [
        # Website: one row per crawled news site; ``name`` is unique.
        migrations.CreateModel(
            name='Website',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100, unique=True)),
                ('base_url', models.URLField()),
                ('description', models.TextField(blank=True, null=True)),
                ('article_list_url', models.URLField(blank=True, null=True)),
                ('article_selector', models.CharField(blank=True, max_length=255, null=True)),
                ('content_selector', models.CharField(blank=True, max_length=255, null=True)),
                ('enabled', models.BooleanField(default=True)),
            ],
        ),
        # Article: one row per stored article; ``url`` is unique and rows are
        # deleted together with their Website (CASCADE).
        migrations.CreateModel(
            name='Article',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(max_length=300)),
                ('url', models.URLField(unique=True)),
                ('pub_date', models.DateTimeField(blank=True, null=True)),
                ('content', models.TextField()),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('media_files', models.JSONField(blank=True, default=list)),
                ('website', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.website')),
            ],
        ),
    ]

View File

26
core/models.py Normal file
View File

@@ -0,0 +1,26 @@
from django.db import models
class Website(models.Model):
    """A news site that stored articles belong to."""

    # Unique display name; also used as the string representation.
    name = models.CharField(unique=True, max_length=100)
    base_url = models.URLField()
    description = models.TextField(null=True, blank=True)
    # Optional fields; presumably crawl configuration (listing page URL and
    # selectors) -- nothing in the visible code reads them yet, TODO confirm.
    article_list_url = models.URLField(null=True, blank=True)
    article_selector = models.CharField(null=True, blank=True, max_length=255)
    content_selector = models.CharField(null=True, blank=True, max_length=255)
    enabled = models.BooleanField(default=True)

    def __str__(self):
        """Return the site name."""
        return self.name
class Article(models.Model):
    """A single stored article belonging to a ``Website``."""

    # Deleting a Website also deletes its articles.
    website = models.ForeignKey(Website, on_delete=models.CASCADE)
    title = models.CharField(max_length=300)
    # Unique, so the same page cannot be stored twice.
    url = models.URLField(unique=True)
    pub_date = models.DateTimeField(blank=True, null=True)
    # HTML content of the article body.
    content = models.TextField()
    # Set automatically when the row is first created.
    created_at = models.DateTimeField(auto_now_add=True)
    # JSON list, empty by default.
    media_files = models.JSONField(blank=True, default=list)

    def __str__(self):
        """Return the article title."""
        return self.title

View File

@@ -0,0 +1,17 @@
<!DOCTYPE html>
{# Article detail page: renders the `article` context variable. #}
<html lang="zh">
<head>
<meta charset="UTF-8" />
<title>{{ article.title }}</title>
</head>
<body>
<h1>{{ article.title }}</h1>
<p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
<hr />
<div>
{# NOTE(review): `safe` disables auto-escaping, so the stored HTML is emitted verbatim. #}
{# The content comes from crawled pages; consider sanitising it before storage or display. #}
{{ article.content|safe }}
</div>
<hr />
{# Link back to the route named 'article_list'. #}
<p><a href="{% url 'article_list' %}">返回列表</a></p>
</body>
</html>

View File

@@ -0,0 +1,33 @@
<!DOCTYPE html>
{# Article list page: renders one page of articles from the `page_obj` context variable. #}
<html lang="zh">
<head>
<meta charset="UTF-8" />
<title>绿色课堂文章列表</title>
</head>
<body>
<h1>绿色课堂文章列表</h1>
<ul>
{# One entry per article on the current page, linking to its detail route. #}
{% for article in page_obj %}
<li>
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
({{ article.created_at|date:"Y-m-d" }})
</li>
{% empty %}
{# Shown when the current page has no articles. #}
<li>暂无文章</li>
{% endfor %}
</ul>
{# Pagination controls: previous/next links appear only when such a page exists. #}
<div class="pagination">
{% if page_obj.has_previous %}
<a href="?page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
{% if page_obj.has_next %}
<a href="?page={{ page_obj.next_page_number }}">下一页</a>
{% endif %}
</div>
</body>
</html>

3
core/tests.py Normal file
View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

10
core/urls.py Normal file
View File

@@ -0,0 +1,10 @@
from django.urls import path
from . import views
# URL routes for this app.
urlpatterns = [
    # Home page: the article list.
    path(route='', view=views.article_list, name='article_list'),
    # Detail page for a single article, addressed by its integer id.
    path(
        route='article/<int:article_id>/',
        view=views.article_detail,
        name='article_detail',
    ),
    # More routes can be added here later.
]

35
core/utils.py Normal file
View File

@@ -0,0 +1,35 @@
import requests
from bs4 import BeautifulSoup
from django.utils import timezone
from core.models import Website, Article
def crawl_xinhua_article(url, website, timeout=10):
    """Fetch one Xinhua article page and store it as an ``Article``.

    The title is read from ``<span class="title">`` and the body from the
    ``<p>`` elements inside ``<span id="detailContent">``. Nothing is stored
    if an article with the same URL already exists or the download fails.

    Args:
        url: URL of the article page; also stored as the article's unique URL.
        website: ``Website`` instance the new article is attached to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Check for duplicates before touching the network, so re-running the
    # crawler does not download pages that are already stored.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    try:
        # Without a timeout, requests can block indefinitely on a stalled server.
        resp = requests.get(url, headers=headers, timeout=timeout)
        # Do not store an HTTP error page (404, 5xx, ...) as if it were an article.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"抓取失败,跳过: {url} ({exc})")
        return
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract the title, falling back to a placeholder when the tag is missing.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Extract the body, keeping the <p> markup so the HTML structure survives.
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        # NOTE: this is the crawl time; the page's own publication time is
        # not parsed.
        pub_date=timezone.now(),
    )
    print(f"已保存文章:{title}")

28
core/views.py Normal file
View File

@@ -0,0 +1,28 @@
from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
from .models import Article
def article_list(request):
    """Render the article list, newest first, ten articles per page.

    The page is selected with the ``page`` query parameter.
    """
    newest_first = Article.objects.order_by('-created_at')
    requested_page = request.GET.get('page')
    context = {'page_obj': Paginator(newest_first, 10).get_page(requested_page)}
    return render(request, 'core/article_list.html', context)
def article_detail(request, article_id):
    """Render the detail page for one article.

    Responds with HTTP 404 when no article has the given id.
    """
    context = {'article': get_object_or_404(Article, pk=article_id)}
    return render(request, 'core/article_detail.html', context)
# Create your views here.