Test success
This commit is contained in:
@@ -1,3 +1,13 @@
|
||||
from django.contrib import admin
|
||||
from .models import Article, SourceSite
|
||||
|
||||
# Register your models here.
|
||||
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    """Admin configuration for ``Article`` records."""

    # Columns shown on the admin change list.
    list_display = (
        'title',
        'category',
        'publish_date',
        'source',
        'is_verified',
    )
    # Fields offered as sidebar filters.
    list_filter = (
        'category',
        'source',
        'is_verified',
    )
    # Fields matched by the admin search box.
    search_fields = (
        'title',
        'content',
    )
|
||||
|
||||
@admin.register(SourceSite)
class SourceSiteAdmin(admin.ModelAdmin):
    """Admin configuration for ``SourceSite`` records."""

    # Columns shown on the admin change list.
    list_display = (
        'name',
        'url',
        'is_active',
    )
    # Fields matched by the admin search box.
    search_fields = (
        'name',
        'url',
    )
|
||||
|
||||
@@ -3,4 +3,4 @@ from django.apps import AppConfig
|
||||
|
||||
class CollectorConfig(AppConfig):
    """Application configuration for the collector app."""

    default_auto_field = 'django.db.models.BigAutoField'
    # Full dotted path of the app package (the short 'collector' name was
    # replaced because the app lives under green_classroom.apps).
    name = 'green_classroom.apps.collector'
|
||||
57
src/green_classroom/apps/collector/crawler/xinhua.py
Normal file
57
src/green_classroom/apps/collector/crawler/xinhua.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# xinhua.py
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def parse_xinhua_article(url: str) -> "dict[str, str] | None":
    """Fetch a Xinhua article page and extract its title and body text.

    Args:
        url: Address of the article page to download.

    Returns:
        A dict with the keys ``"url"``, ``"title"`` and ``"content"``, or
        ``None`` when the request fails, the server answers with an HTTP
        error status, the ``detailContent`` element is missing, or the
        extracted body is shorter than 50 characters.
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        resp = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page;
        # HTTPError is a RequestException, so the handler below covers it.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 请求失败:{e}")
        return None

    # Force UTF-8 decoding of the body before reading resp.text.
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract the title, falling back to a placeholder when it is absent.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Extract the body container; without it there is nothing to return.
    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"❌ 没找到 detailContent: {url}")
        return None

    paragraphs = content_tag.find_all("p")
    content = "\n".join(p.get_text(strip=True) for p in paragraphs)

    # Validity check: reject pages whose body is too short to be an article.
    if len(content.strip()) < 50:
        print(f"⚠️ 内容过短:{url}")
        return None

    return {
        "url": url,
        "title": title,
        "content": content
    }
|
||||
|
||||
def crawl_xinhua_green():
    """Entry point for the Xinhua green-content crawler.

    Parses a single example URL and prints whether the attempt succeeded.
    """
    print("✅ 开始爬取新华网绿色内容...")

    # Example URL; extend this to the real set of pages as needed.
    article = parse_xinhua_article("https://www.xinhua.net.cn/example-article")

    if not article:
        print("❌ 爬取失败")
        return

    print("✅ 爬取成功:", article['title'])
|
||||
@@ -0,0 +1,10 @@
|
||||
# src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
|
||||
|
||||
class Command(BaseCommand):
    """Management command that runs the Xinhua green-content crawler."""

    help = "抓取新华网绿色发展相关资料"

    def handle(self, *args, **kwargs):
        """Run the crawler; the received arguments are not used."""
        crawl_xinhua_green()
|
||||
@@ -0,0 +1,39 @@
|
||||
# Generated by Django 5.1 on 2025-07-21 12:00
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial migration creating the ``SourceSite`` and ``Article`` tables."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name='SourceSite',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('name', models.CharField(max_length=100)),
                ('url', models.URLField()),
                ('is_active', models.BooleanField(default=True)),
                ('remarks', models.TextField(blank=True)),
            ],
        ),
        migrations.CreateModel(
            name='Article',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('title', models.CharField(max_length=300)),
                ('url', models.URLField(unique=True)),
                ('publish_date', models.DateField(blank=True, null=True)),
                ('content', models.TextField()),
                (
                    'category',
                    models.CharField(
                        choices=[
                            ('政策', '政策'),
                            ('案例', '案例'),
                            ('新闻', '新闻'),
                            ('科研', '科研'),
                        ],
                        max_length=100,
                    ),
                ),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('is_verified', models.BooleanField(default=False)),
                (
                    'source',
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to='collector.sourcesite',
                    ),
                ),
            ],
        ),
    ]
|
||||
@@ -1,3 +1,30 @@
|
||||
from django.db import models
|
||||
|
||||
# Create your models here.
|
||||
class SourceSite(models.Model):
    """A source website record: name, URL, active flag and free-form remarks."""

    name = models.CharField(max_length=100)
    url = models.URLField()
    is_active = models.BooleanField(default=True)
    remarks = models.TextField(blank=True)

    def __str__(self):
        """Return the site name as the display string."""
        return self.name
|
||||
|
||||
class Article(models.Model):
    """A stored article with its URL, text, category, source and review flag."""

    # Each category is stored and displayed using the same label.
    CATEGORY_CHOICES = [
        (label, label) for label in ('政策', '案例', '新闻', '科研')
    ]

    title = models.CharField(max_length=300)
    # The URL is unique, so the same page cannot be stored twice.
    url = models.URLField(unique=True)
    publish_date = models.DateField(null=True, blank=True)
    content = models.TextField()
    category = models.CharField(max_length=100, choices=CATEGORY_CHOICES)
    # Deleting a SourceSite keeps its articles and nulls this reference.
    source = models.ForeignKey(
        SourceSite,
        on_delete=models.SET_NULL,
        null=True,
    )
    created_at = models.DateTimeField(auto_now_add=True)
    is_verified = models.BooleanField(default=False)

    def __str__(self):
        """Return the article title as the display string."""
        return self.title
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{{ article.title }}</title>
</head>
<body>
    <h1>{{ article.title }}</h1>
    {# source is nullable (SET_NULL); skip the line rather than show an empty label #}
    {% if article.source %}
    <p><strong>来源:</strong>{{ article.source.name }}</p>
    {% endif %}
    <p><strong>分类:</strong>{{ article.category }}</p>
    {# publish_date may be NULL; unguarded it would be rendered as the text "None" #}
    {% if article.publish_date %}
    <p><strong>发布时间:</strong>{{ article.publish_date }}</p>
    {% endif %}
    <div>
        {{ article.content|linebreaks }}
    </div>
    <p><a href="{% url 'collector:article_list' %}">返回列表</a></p>
</body>
</html>
|
||||
@@ -0,0 +1,24 @@
|
||||
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>绿色课堂资料库</title>
</head>
<body>
    <h1>绿色课堂资料库</h1>
    {# Keyword search; the current query is echoed back into the input #}
    <form method="get">
        <input type="text" name="q" placeholder="关键词搜索..." value="{{ request.GET.q }}">
        <button type="submit">搜索</button>
    </form>

    <ul>
        {% for article in articles %}
        <li>
            <a href="{% url 'collector:article_detail' article.id %}">{{ article.title }}</a>
            {# publish_date may be NULL; omit it instead of printing "None" #}
            ({{ article.category }}{% if article.publish_date %} | {{ article.publish_date }}{% endif %})
        </li>
        {% empty %}
        <li>暂无内容</li>
        {% endfor %}
    </ul>
</body>
</html>
|
||||
9
src/green_classroom/apps/collector/urls.py
Normal file
9
src/green_classroom/apps/collector/urls.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from django.urls import path
|
||||
from . import views
|
||||
|
||||
# URL namespace used when reversing, e.g. 'collector:article_list'.
app_name = 'collector'

urlpatterns = [
    # Article index page.
    path('', views.article_list, name='article_list'),
    # Single article addressed by its integer primary key.
    path('<int:pk>/', views.article_detail, name='article_detail'),
]
|
||||
@@ -1,3 +1,20 @@
|
||||
from django.shortcuts import render
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from .models import Article
|
||||
from django.db.models import Q
|
||||
|
||||
# Create your views here.
|
||||
def article_list(request):
    """Render the list of verified articles, optionally filtered by keyword.

    When the ``q`` GET parameter is non-blank, only articles whose title or
    content contains it (case-insensitive) are kept. At most 100 articles are
    passed to the template, ordered by publish date descending.
    """
    # Strip so a whitespace-only query behaves like no query at all.
    query = request.GET.get('q', '').strip()
    articles = Article.objects.filter(is_verified=True)

    if query:
        articles = articles.filter(
            Q(title__icontains=query) | Q(content__icontains=query)
        )

    return render(request, 'collector/article_list.html', {
        # '-pk' breaks ties between equal or NULL dates so the sliced
        # result is deterministic across requests.
        'articles': articles.order_by('-publish_date', '-pk')[:100]
    })
|
||||
|
||||
def article_detail(request, pk):
    """Render the detail page for one verified article.

    The lookup goes through ``get_object_or_404`` with ``is_verified=True``,
    so unverified articles are treated the same as missing ones.
    """
    context = {
        'article': get_object_or_404(Article, pk=pk, is_verified=True),
    }
    return render(request, 'collector/article_detail.html', context)
|
||||
|
||||
@@ -37,6 +37,7 @@ INSTALLED_APPS = [
|
||||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'green_classroom.apps.collector.apps.CollectorConfig',
|
||||
]
|
||||
|
||||
MIDDLEWARE = [
|
||||
|
||||
@@ -1,22 +1,7 @@
|
||||
"""
|
||||
URL configuration for green_classroom project.
|
||||
|
||||
The `urlpatterns` list routes URLs to views. For more information please see:
|
||||
https://docs.djangoproject.com/en/5.1/topics/http/urls/
|
||||
Examples:
|
||||
Function views
|
||||
1. Add an import: from my_app import views
|
||||
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
||||
Class-based views
|
||||
1. Add an import: from other_app.views import Home
|
||||
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
||||
Including another URLconf
|
||||
1. Import the include() function: from django.urls import include, path
|
||||
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
||||
"""
|
||||
from django.contrib import admin
|
||||
from django.urls import path
|
||||
from django.urls import path, include
|
||||
|
||||
# Project-level routes: the Django admin and the collector app's URLconf.
urlpatterns = [
    path('admin/', admin.site.urls),
    path('collector/', include('green_classroom.apps.collector.urls')),
]
|
||||
|
||||
10
src/test_xinhua_article.py
Normal file
10
src/test_xinhua_article.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from green_classroom.apps.collector.crawler.xinhua import parse_xinhua_article
|
||||
|
||||
def main():
    """Fetch one known Xinhua article and print its title and a body preview."""
    url = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html"
    article = parse_xinhua_article(url)

    if article:
        print("✅ 成功抓取文章:", article["title"])
        print("📄 正文预览:\n", article["content"][:500])
    else:
        print("❌ 抓取失败")


# Only hit the network when run as a script; merely importing this module
# (e.g. by a test collector matching test_*.py) must not trigger a request.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user