From c750d77eab4d95ce5e8df1eb4da36deb09584a06 Mon Sep 17 00:00:00 2001 From: yuangyaa Date: Mon, 21 Jul 2025 20:40:36 +0800 Subject: [PATCH] Test success --- .idea/misc.xml | 3 + src/green_classroom/apps/collector/admin.py | 12 +++- src/green_classroom/apps/collector/apps.py | 2 +- .../apps/collector/crawler/__init__.py | 0 .../apps/collector/crawler/xinhua.py | 57 +++++++++++++++++++ .../apps/collector/management/__init__.py | 0 .../collector/management/commands/__init__.py | 0 .../management/commands/crawl_xinhua.py | 10 ++++ .../apps/collector/migrations/0001_initial.py | 39 +++++++++++++ src/green_classroom/apps/collector/models.py | 29 +++++++++- .../templates/collector/article_detail.html | 16 ++++++ .../templates/collector/article_list.html | 24 ++++++++ src/green_classroom/apps/collector/urls.py | 9 +++ src/green_classroom/apps/collector/views.py | 21 ++++++- src/green_classroom/settings.py | 1 + src/green_classroom/urls.py | 19 +------ src/test_xinhua_article.py | 10 ++++ 17 files changed, 230 insertions(+), 22 deletions(-) create mode 100644 src/green_classroom/apps/collector/crawler/__init__.py create mode 100644 src/green_classroom/apps/collector/crawler/xinhua.py create mode 100644 src/green_classroom/apps/collector/management/__init__.py create mode 100644 src/green_classroom/apps/collector/management/commands/__init__.py create mode 100644 src/green_classroom/apps/collector/management/commands/crawl_xinhua.py create mode 100644 src/green_classroom/apps/collector/migrations/0001_initial.py create mode 100644 src/green_classroom/apps/collector/templates/collector/article_detail.html create mode 100644 src/green_classroom/apps/collector/templates/collector/article_list.html create mode 100644 src/green_classroom/apps/collector/urls.py create mode 100644 src/test_xinhua_article.py diff --git a/.idea/misc.xml b/.idea/misc.xml index 3232bb5..e4b6925 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ + + \ No newline at end of file diff --git a/src/green_classroom/apps/collector/admin.py b/src/green_classroom/apps/collector/admin.py index 8c38f3f..4a8b5c9 100644 --- a/src/green_classroom/apps/collector/admin.py +++ b/src/green_classroom/apps/collector/admin.py @@ -1,3 +1,13 @@ from django.contrib import admin +from .models import Article, SourceSite -# Register your models here. +@admin.register(Article) +class ArticleAdmin(admin.ModelAdmin): + list_display = ('title', 'category', 'publish_date', 'source', 'is_verified') + list_filter = ('category', 'source', 'is_verified') + search_fields = ('title', 'content') + +@admin.register(SourceSite) +class SourceSiteAdmin(admin.ModelAdmin): + list_display = ('name', 'url', 'is_active') + search_fields = ('name', 'url') diff --git a/src/green_classroom/apps/collector/apps.py b/src/green_classroom/apps/collector/apps.py index 0117b57..c1e37fc 100644 --- a/src/green_classroom/apps/collector/apps.py +++ b/src/green_classroom/apps/collector/apps.py @@ -3,4 +3,4 @@ from django.apps import AppConfig class CollectorConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' - name = 'collector' + name = 'green_classroom.apps.collector' # 修改为完整路径 \ No newline at end of file diff --git a/src/green_classroom/apps/collector/crawler/__init__.py b/src/green_classroom/apps/collector/crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/green_classroom/apps/collector/crawler/xinhua.py b/src/green_classroom/apps/collector/crawler/xinhua.py new file mode 100644 index 0000000..9faaaf4 --- /dev/null +++ b/src/green_classroom/apps/collector/crawler/xinhua.py @@ -0,0 +1,57 @@ +# xinhua.py +import requests +from bs4 import BeautifulSoup + +def parse_xinhua_article(url: str): + """ + 解析新华网文章页,返回 dict 包含标题与正文 + """ + headers = { + "User-Agent": "Mozilla/5.0" + } + + try: + resp = requests.get(url, headers=headers, timeout=10) + resp.encoding = 'utf-8' + except requests.RequestException as e: + print(f"❌ 请求失败:{e}") + return None + + soup = BeautifulSoup(resp.text, "html.parser") + + # 提取标题 + title_tag = soup.find("span", class_="title") + title = title_tag.get_text(strip=True) if title_tag else "无标题" + + # 提取正文 + content_tag = soup.find("span", id="detailContent") + if not content_tag: + print(f"❌ 没找到 detailContent: {url}") + return None + + paragraphs = content_tag.find_all("p") + content = "\n".join(p.get_text(strip=True) for p in paragraphs) + + # 有效性校验 + if len(content.strip()) < 50: + print(f"⚠️ 内容过短:{url}") + return None + + return { + "url": url, + "title": title, + "content": content + } + +def crawl_xinhua_green(): + """ + 启动新华网绿色内容爬虫的入口函数 + """ + print("✅ 开始爬取新华网绿色内容...") + # 示例 URL,实际应根据需求进行扩展 + test_url = "https://www.xinhua.net.cn/example-article" + result = parse_xinhua_article(test_url) + if result: + print("✅ 爬取成功:", result['title']) + else: + print("❌ 爬取失败") \ No newline at end of file diff --git a/src/green_classroom/apps/collector/management/__init__.py b/src/green_classroom/apps/collector/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/green_classroom/apps/collector/management/commands/__init__.py b/src/green_classroom/apps/collector/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py b/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py new file mode 100644 index 0000000..7b34c27 --- /dev/null +++ b/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py @@ -0,0 +1,10 @@ +# src/green_classroom/apps/collector/management/commands/crawl_xinhua.py + +from django.core.management.base import BaseCommand +from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green + +class Command(BaseCommand): + help = "抓取新华网绿色发展相关资料" + + def handle(self, *args, **kwargs): + crawl_xinhua_green() diff --git a/src/green_classroom/apps/collector/migrations/0001_initial.py b/src/green_classroom/apps/collector/migrations/0001_initial.py new file mode 100644 index 0000000..442cc2e --- /dev/null +++ b/src/green_classroom/apps/collector/migrations/0001_initial.py @@ -0,0 +1,39 @@ +# Generated by Django 5.1 on 2025-07-21 12:00 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='SourceSite', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=100)), + ('url', models.URLField()), + ('is_active', models.BooleanField(default=True)), + ('remarks', models.TextField(blank=True)), + ], + ), + migrations.CreateModel( + name='Article', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('title', models.CharField(max_length=300)), + ('url', models.URLField(unique=True)), + ('publish_date', models.DateField(blank=True, null=True)), + ('content', models.TextField()), + ('category', models.CharField(choices=[('政策', '政策'), ('案例', '案例'), ('新闻', '新闻'), ('科研', '科研')], max_length=100)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('is_verified', models.BooleanField(default=False)), + ('source', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='collector.sourcesite')), + ], + ), + ] diff --git a/src/green_classroom/apps/collector/models.py b/src/green_classroom/apps/collector/models.py index 71a8362..872e01f 100644 --- a/src/green_classroom/apps/collector/models.py +++ b/src/green_classroom/apps/collector/models.py @@ -1,3 +1,30 @@ from django.db import models -# Create your models here. +class SourceSite(models.Model): + name = models.CharField(max_length=100) + url = models.URLField() + is_active = models.BooleanField(default=True) + remarks = models.TextField(blank=True) + + def __str__(self): + return self.name + +class Article(models.Model): + CATEGORY_CHOICES = [ + ('政策', '政策'), + ('案例', '案例'), + ('新闻', '新闻'), + ('科研', '科研'), + ] + + title = models.CharField(max_length=300) + url = models.URLField(unique=True) + publish_date = models.DateField(null=True, blank=True) + content = models.TextField() + category = models.CharField(max_length=100, choices=CATEGORY_CHOICES) + source = models.ForeignKey(SourceSite, on_delete=models.SET_NULL, null=True) + created_at = models.DateTimeField(auto_now_add=True) + is_verified = models.BooleanField(default=False) + + def __str__(self): + return self.title diff --git a/src/green_classroom/apps/collector/templates/collector/article_detail.html b/src/green_classroom/apps/collector/templates/collector/article_detail.html new file mode 100644 index 0000000..956b661 --- /dev/null +++ b/src/green_classroom/apps/collector/templates/collector/article_detail.html @@ -0,0 +1,16 @@ + + + + {{ article.title }} + + +

{{ article.title }}

+

来源:{{ article.source.name }}

+

分类:{{ article.category }}

+

发布时间:{{ article.publish_date }}

+
+ {{ article.content|linebreaks }} +
+

返回列表

+ + diff --git a/src/green_classroom/apps/collector/templates/collector/article_list.html b/src/green_classroom/apps/collector/templates/collector/article_list.html new file mode 100644 index 0000000..b41bd1f --- /dev/null +++ b/src/green_classroom/apps/collector/templates/collector/article_list.html @@ -0,0 +1,24 @@ + + + + 绿色课堂资料库 + + +

绿色课堂资料库

+
+ + +
+ + + + diff --git a/src/green_classroom/apps/collector/urls.py b/src/green_classroom/apps/collector/urls.py new file mode 100644 index 0000000..d220e8b --- /dev/null +++ b/src/green_classroom/apps/collector/urls.py @@ -0,0 +1,9 @@ +from django.urls import path +from . import views + +app_name = 'collector' + +urlpatterns = [ + path('', views.article_list, name='article_list'), + path('/', views.article_detail, name='article_detail'), +] diff --git a/src/green_classroom/apps/collector/views.py b/src/green_classroom/apps/collector/views.py index 91ea44a..0e28803 100644 --- a/src/green_classroom/apps/collector/views.py +++ b/src/green_classroom/apps/collector/views.py @@ -1,3 +1,20 @@ -from django.shortcuts import render +from django.shortcuts import render, get_object_or_404 +from .models import Article +from django.db.models import Q -# Create your views here. +def article_list(request): + query = request.GET.get('q', '') + articles = Article.objects.filter(is_verified=True) + + if query: + articles = articles.filter(Q(title__icontains=query) | Q(content__icontains=query)) + + return render(request, 'collector/article_list.html', { + 'articles': articles.order_by('-publish_date')[:100] + }) + +def article_detail(request, pk): + article = get_object_or_404(Article, pk=pk, is_verified=True) + return render(request, 'collector/article_detail.html', { + 'article': article + }) diff --git a/src/green_classroom/settings.py b/src/green_classroom/settings.py index b6cf62a..f15efac 100644 --- a/src/green_classroom/settings.py +++ b/src/green_classroom/settings.py @@ -37,6 +37,7 @@ INSTALLED_APPS = [ 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', + 'green_classroom.apps.collector.apps.CollectorConfig', ] MIDDLEWARE = [ diff --git a/src/green_classroom/urls.py b/src/green_classroom/urls.py index 3df1d83..3d5312c 100644 --- a/src/green_classroom/urls.py +++ b/src/green_classroom/urls.py @@ -1,22 +1,7 @@ -""" -URL configuration for green_classroom project. - -The `urlpatterns` list routes URLs to views. For more information please see: - https://docs.djangoproject.com/en/5.1/topics/http/urls/ -Examples: -Function views - 1. Add an import: from my_app import views - 2. Add a URL to urlpatterns: path('', views.home, name='home') -Class-based views - 1. Add an import: from other_app.views import Home - 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') -Including another URLconf - 1. Import the include() function: from django.urls import include, path - 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) -""" from django.contrib import admin -from django.urls import path +from django.urls import path, include urlpatterns = [ path('admin/', admin.site.urls), + path('collector/', include('green_classroom.apps.collector.urls')), ] diff --git a/src/test_xinhua_article.py b/src/test_xinhua_article.py new file mode 100644 index 0000000..7d9d3d8 --- /dev/null +++ b/src/test_xinhua_article.py @@ -0,0 +1,10 @@ +from green_classroom.apps.collector.crawler.xinhua import parse_xinhua_article + +url = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html" +article = parse_xinhua_article(url) + +if article: + print("✅ 成功抓取文章:", article["title"]) + print("📄 正文预览:\n", article["content"][:500]) +else: + print("❌ 抓取失败")