xinhua_base
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -166,7 +166,7 @@ cython_debug/
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
.idea/
|
||||
|
||||
# Ruff stuff:
|
||||
.ruff_cache/
|
||||
|
||||
0
core/__init__.py
Normal file
0
core/__init__.py
Normal file
11
core/admin.py
Normal file
11
core/admin.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from django.contrib import admin
|
||||
from .models import Website, Article
|
||||
|
||||
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    """Admin options for ``Website`` records."""

    # Columns shown on the admin change-list page.
    list_display = ('name', 'base_url', 'enabled')
|
||||
|
||||
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    """Admin options for ``Article`` records."""

    # Columns shown on the admin change-list page.
    list_display = ('title', 'website', 'pub_date')
    # Fields matched by the admin search box.
    search_fields = ('title', 'content')
|
||||
6
core/apps.py
Normal file
6
core/apps.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
    """Application configuration for the ``core`` app."""

    # Dotted path of the application this config belongs to.
    name = 'core'
    # Primary-key field type used for models that do not declare one.
    default_auto_field = 'django.db.models.BigAutoField'
|
||||
23
core/management/commands/crawl_xinhua.py
Normal file
23
core/management/commands/crawl_xinhua.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import crawl_xinhua_article
|
||||
|
||||
class Command(BaseCommand):
    """Management command that crawls a fixed list of Xinhua article URLs."""

    help = '爬取新华网文章示例'

    def handle(self, *args, **options):
        """Look up the target ``Website`` row and crawl every listed URL.

        Writes an error message to stdout and returns early when the
        ``Website`` row named below does not exist.
        """
        # The Website record is assumed to have been created beforehand
        # (for example through the admin site).
        website_name = "新华网"

        # Article URLs to crawl; add more entries to fetch several articles.
        urls = [
            "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
        ]

        try:
            website = Website.objects.get(name=website_name)
        except Website.DoesNotExist:
            message = f"网站 '{website_name}' 不存在,请先后台创建"
            self.stdout.write(self.style.ERROR(message))
            return

        for url in urls:
            crawl_xinhua_article(url, website)
|
||||
41
core/migrations/0001_initial.py
Normal file
41
core/migrations/0001_initial.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# Generated by Django 5.1 on 2025-08-11 04:53
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial migration for the core app: creates Website and Article."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name='Website',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('name', models.CharField(max_length=100, unique=True)),
                ('base_url', models.URLField()),
                ('description', models.TextField(blank=True, null=True)),
                ('article_list_url', models.URLField(blank=True, null=True)),
                (
                    'article_selector',
                    models.CharField(blank=True, max_length=255, null=True),
                ),
                (
                    'content_selector',
                    models.CharField(blank=True, max_length=255, null=True),
                ),
                ('enabled', models.BooleanField(default=True)),
            ],
        ),
        migrations.CreateModel(
            name='Article',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('title', models.CharField(max_length=300)),
                ('url', models.URLField(unique=True)),
                ('pub_date', models.DateTimeField(blank=True, null=True)),
                ('content', models.TextField()),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('media_files', models.JSONField(blank=True, default=list)),
                (
                    'website',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='core.website',
                    ),
                ),
            ],
        ),
    ]
|
||||
0
core/migrations/__init__.py
Normal file
0
core/migrations/__init__.py
Normal file
26
core/models.py
Normal file
26
core/models.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from django.db import models
|
||||
|
||||
class Website(models.Model):
    """A source website that articles are attached to."""

    # Unique display name; also what ``__str__`` returns.
    name = models.CharField(max_length=100, unique=True)
    base_url = models.URLField()
    description = models.TextField(blank=True, null=True)
    # Optional crawl settings.
    # NOTE(review): presumably a listing URL plus CSS selectors for a generic
    # crawler; nothing visible here reads them yet - confirm before relying
    # on that.
    article_list_url = models.URLField(blank=True, null=True)
    article_selector = models.CharField(max_length=255, blank=True, null=True)
    content_selector = models.CharField(max_length=255, blank=True, null=True)
    enabled = models.BooleanField(default=True)

    def __str__(self):
        """Return the website name."""
        return self.name
|
||||
|
||||
|
||||
class Article(models.Model):
    """A stored article belonging to one ``Website``."""

    # Deleting the website deletes its articles as well.
    website = models.ForeignKey(Website, on_delete=models.CASCADE)
    title = models.CharField(max_length=300)
    # The URL is unique, so the same page cannot be stored twice.
    url = models.URLField(unique=True)
    pub_date = models.DateTimeField(null=True, blank=True)
    content = models.TextField()  # HTML content
    # Filled in automatically when the row is first created.
    created_at = models.DateTimeField(auto_now_add=True)
    # JSON value defaulting to an empty list.
    media_files = models.JSONField(default=list, blank=True)

    def __str__(self):
        """Return the article title."""
        return self.title
|
||||
17
core/templates/core/article_detail.html
Normal file
17
core/templates/core/article_detail.html
Normal file
@@ -0,0 +1,17 @@
|
||||
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8" />
    <title>{{ article.title }}</title>
</head>
<body>
    {# Detail page for one article; expects `article` in the template context. #}
    <h1>{{ article.title }}</h1>
    {# pub_date may be NULL on the model, so only show the line when it is set. #}
    {% if article.pub_date %}
    <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
    {% endif %}
    <hr />
    <div>
        {# NOTE(review): content is stored HTML and is rendered unescaped; it must come from a trusted source. #}
        {{ article.content|safe }}
    </div>
    <hr />
    <p><a href="{% url 'article_list' %}">返回列表</a></p>
</body>
</html>
|
||||
33
core/templates/core/article_list.html
Normal file
33
core/templates/core/article_list.html
Normal file
@@ -0,0 +1,33 @@
|
||||
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8" />
    <title>绿色课堂文章列表</title>
</head>
<body>
    {# Article index; expects `page_obj` (one page of articles) in the context. #}
    <h1>绿色课堂文章列表</h1>

    <ul>
        {% for article in page_obj %}
        <li>
            <a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
            ({{ article.created_at|date:"Y-m-d" }})
        </li>
        {% empty %}
        <li>暂无文章</li>
        {% endfor %}
    </ul>

    {# Previous / next links are only shown when such a page exists. #}
    <div class="pagination">
        {% if page_obj.has_previous %}
        <a href="?page={{ page_obj.previous_page_number }}">上一页</a>
        {% endif %}

        <span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>

        {% if page_obj.has_next %}
        <a href="?page={{ page_obj.next_page_number }}">下一页</a>
        {% endif %}
    </div>
</body>
</html>
|
||||
3
core/tests.py
Normal file
3
core/tests.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
||||
10
core/urls.py
Normal file
10
core/urls.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from django.urls import path
|
||||
from . import views
|
||||
|
||||
# URL routes of the core app.
urlpatterns = [
    # Home page: the article list.
    path('', views.article_list, name='article_list'),
    # Detail page of a single article, addressed by its integer id.
    path(
        'article/<int:article_id>/',
        views.article_detail,
        name='article_detail',
    ),
    # More routes can be added here later.
]
|
||||
35
core/utils.py
Normal file
35
core/utils.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from django.utils import timezone
|
||||
from core.models import Website, Article
|
||||
|
||||
def crawl_xinhua_article(url, website, timeout=10):
    """Fetch one Xinhua article page and store it as an ``Article`` row.

    Args:
        url: URL of the article page; also stored as the article's unique URL.
        website: ``Website`` instance the new article is attached to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Check for duplicates first so an already stored URL costs no network
    # round trip.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout a stalled server would block the crawl forever.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Refuse to store error pages (404, 500, ...) as if they were articles.
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Title: text of <span class="title">, with a placeholder when missing.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Body: every <p> inside <span id="detailContent">, kept as HTML so the
    # paragraph structure survives.
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        # NOTE(review): this records the crawl time, not a date parsed from
        # the page.
        pub_date=timezone.now(),
    )
    print(f"已保存文章:{title}")
|
||||
28
core/views.py
Normal file
28
core/views.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from django.core.paginator import Paginator
|
||||
from .models import Article
|
||||
|
||||
def article_list(request):
    """Render the article list, newest first, ten articles per page.

    The page is chosen by the ``page`` query parameter.
    """
    newest_first = Article.objects.order_by('-created_at')
    pager = Paginator(newest_first, 10)  # ten articles per page
    current_page = pager.get_page(request.GET.get('page'))

    context = {'page_obj': current_page}
    return render(request, 'core/article_list.html', context)
|
||||
|
||||
def article_detail(request, article_id):
    """Render a single article, or answer 404 when no article has that id."""
    found = get_object_or_404(Article, pk=article_id)
    context = {'article': found}
    return render(request, 'core/article_detail.html', context)
|
||||
|
||||
# Create your views here.
|
||||
0
green_classroom/__init__.py
Normal file
0
green_classroom/__init__.py
Normal file
16
green_classroom/asgi.py
Normal file
16
green_classroom/asgi.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
ASGI config for green_classroom project.
|
||||
|
||||
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
|
||||
"""
|
||||
|
||||
import os

from django.core.asgi import get_asgi_application

# Point Django at the project settings unless the environment already
# names a settings module.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')

# Module-level ASGI callable.
application = get_asgi_application()
|
||||
126
green_classroom/settings.py
Normal file
126
green_classroom/settings.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""
|
||||
Django settings for green_classroom project.
|
||||
|
||||
Generated by 'django-admin startproject' using Django 5.1.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/topics/settings/
|
||||
|
||||
For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/5.1/ref/settings/
|
||||
"""
|
||||
|
||||
import os
from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
# BASE_DIR is defined once and stays a Path for the whole module.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []

# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'core',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'green_classroom.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'green_classroom.wsgi.application'

# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}

# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/

LANGUAGE_CODE = 'zh-Hans'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/

STATIC_URL = 'static/'

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

# Media files
# BASE_DIR is deliberately NOT rebound to a string here, so that
# `BASE_DIR / 'subdir'` keeps working everywhere in this module.
# os.path.join accepts a Path and returns a str, so MEDIA_ROOT stays a string.
# NOTE(review): the 'date' directory name may be a typo for 'data' - confirm
# before renaming, since existing media files live under this path.
MEDIA_ROOT = os.path.join(BASE_DIR, 'date', 'media')
MEDIA_URL = '/media/'
|
||||
13
green_classroom/urls.py
Normal file
13
green_classroom/urls.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from django.contrib import admin
|
||||
from django.urls import path, include
|
||||
from django.conf import settings
|
||||
from django.conf.urls.static import static
|
||||
|
||||
# Project-level URL routes.
urlpatterns = [
    path('admin/', admin.site.urls),
    # Public-facing pages are delegated to the core app's URLconf.
    path('', include('core.urls')),
]

# Media routes are added only while DEBUG is on.
if settings.DEBUG:
    urlpatterns.extend(
        static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
    )
|
||||
16
green_classroom/wsgi.py
Normal file
16
green_classroom/wsgi.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
WSGI config for green_classroom project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
|
||||
"""
|
||||
|
||||
import os

from django.core.wsgi import get_wsgi_application

# Point Django at the project settings unless the environment already
# names a settings module.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')

# Module-level WSGI callable.
application = get_wsgi_application()
|
||||
22
manage.py
Executable file
22
manage.py
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
"""Django's command-line utility for administrative tasks."""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
    try:
        from django.core import management
    except ImportError as exc:
        # Re-raise with a hint about the most common causes, keeping the
        # original error as the cause.
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    management.execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user