diff --git a/src/green_classroom/apps/collector/admin.py b/src/green_classroom/apps/collector/admin.py
index 4a8b5c9..200d5e7 100644
--- a/src/green_classroom/apps/collector/admin.py
+++ b/src/green_classroom/apps/collector/admin.py
@@ -1,13 +1,7 @@
 from django.contrib import admin
-from .models import Article, SourceSite
+from .models import Article
 
 @admin.register(Article)
 class ArticleAdmin(admin.ModelAdmin):
-    list_display = ('title', 'category', 'publish_date', 'source', 'is_verified')
-    list_filter = ('category', 'source', 'is_verified')
-    search_fields = ('title', 'content')
-
-@admin.register(SourceSite)
-class SourceSiteAdmin(admin.ModelAdmin):
-    list_display = ('name', 'url', 'is_active')
-    search_fields = ('name', 'url')
+    list_display = ('title', 'url', 'crawled')
+    search_fields = ('title',)
diff --git a/src/green_classroom/apps/collector/crawler/xinhua.py b/src/green_classroom/apps/collector/crawler/xinhua.py
index 9faaaf4..dcd9814 100644
--- a/src/green_classroom/apps/collector/crawler/xinhua.py
+++ b/src/green_classroom/apps/collector/crawler/xinhua.py
@@ -1,29 +1,32 @@
-# xinhua.py
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timedelta
+import pytz  # make sure pytz is installed; it is used for timezone handling
+from green_classroom.apps.collector.models import Article  # needed by Article.objects.update_or_create below
 
-def parse_xinhua_article(url: str):
-    """
-    Parse a Xinhua article page; return a dict with the title and body text.
-    """
-    headers = {
-        "User-Agent": "Mozilla/5.0"
-    }
+BASE_URL = "https://www.news.cn/"
+HEADERS = {
+    "User-Agent": "Mozilla/5.0"
+}
+
+def parse_xinhua_article(url: str, time_range_days: int = None):
+    """
+    Parse a Xinhua article page and return a dict with the title and body text.
+    If time_range_days is given, only articles inside that time window are kept.
+    """
     try:
-        resp = requests.get(url, headers=headers, timeout=10)
+        resp = requests.get(url, headers=HEADERS, timeout=10)
         resp.encoding = 'utf-8'
     except requests.RequestException as e:
-        print(f"❌ Request failed: {e}")
+        print(f"❌ Request failed: {e} URL: {url}")
        return None
 
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    # extract the title
     title_tag = soup.find("span", class_="title")
     title = title_tag.get_text(strip=True) if title_tag else "Untitled"
 
-    # extract the body
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
         print(f"❌ detailContent not found: {url}")
         return None
@@ -32,26 +35,90 @@ def parse_xinhua_article(url: str):
     paragraphs = content_tag.find_all("p")
     content = "\n".join(p.get_text(strip=True) for p in paragraphs)
 
-    # validity check
     if len(content.strip()) < 50:
         print(f"⚠️ Content too short: {url}")
         return None
 
+    # Extract the publish time (assuming the format YYYY-MM-DD)
+    publish_time_tag = soup.find("span", class_="publish-time")
+    if publish_time_tag:
+        publish_time_str = publish_time_tag.get_text(strip=True)
+        try:
+            publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
+        except ValueError:
+            print(f"❌ Could not parse publish time: {publish_time_str} URL: {url}")
+            return None
+    else:
+        print(f"❌ No publish time found on the page: {url}")
+        return None
+
+    # Apply the optional time window
+    if time_range_days is not None:
+        cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
+        if publish_time < cutoff_time:
+            print(f"⏰ Article outside the time range: {url}")
+            return None
+
     return {
         "url": url,
         "title": title,
-        "content": content
+        "content": content,
+        "publish_time": publish_time
     }
 
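+# NOTE: get_channel_urls() and get_article_urls_from_channel() are called by
+# crawl_xinhua_green() below but are not defined or imported anywhere in this
+# module. The two helpers here are minimal sketches so the crawler can run
+# end to end; the homepage layout and the link-filtering rules are assumptions
+# and may need adjusting for www.news.cn.
+def get_channel_urls():
+    """Collect channel (section) URLs from the Xinhua homepage (assumed layout)."""
+    try:
+        resp = requests.get(BASE_URL, headers=HEADERS, timeout=10)
+        resp.encoding = 'utf-8'
+    except requests.RequestException as e:
+        print(f"❌ Failed to fetch the homepage: {e}")
+        return []
+    soup = BeautifulSoup(resp.text, "html.parser")
+    urls = set()
+    for a in soup.find_all("a", href=True):
+        href = urljoin(BASE_URL, a["href"])
+        # Assumption: channel pages live directly under www.news.cn and end with '/'
+        if href.startswith(BASE_URL) and href.endswith("/") and href != BASE_URL:
+            urls.add(href)
+    return sorted(urls)
+
+def get_article_urls_from_channel(channel_url: str):
+    """Collect article URLs from one channel page (assumed link pattern)."""
+    try:
+        resp = requests.get(channel_url, headers=HEADERS, timeout=10)
+        resp.encoding = 'utf-8'
+    except requests.RequestException as e:
+        print(f"❌ Failed to fetch channel {channel_url}: {e}")
+        return []
+    soup = BeautifulSoup(resp.text, "html.parser")
+    urls = set()
+    for a in soup.find_all("a", href=True):
+        href = urljoin(channel_url, a["href"])
+        # Assumption: article pages are .htm/.html pages on www.news.cn
+        if href.startswith(BASE_URL) and href.endswith((".htm", ".html")):
+            urls.add(href)
+    return sorted(urls)
+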
-def crawl_xinhua_green():
+def crawl_xinhua_green(time_range_days: int = None):
     """
-    Entry point that starts the Xinhua green-content crawler.
+    Crawl every Xinhua channel and its articles and save them to the database
+    (multi-threaded). Optional: time_range_days only keeps articles published
+    within the last N days.
     """
-    print("✅ Starting to crawl Xinhua green content...")
-    # example URL; extend this according to the actual requirements
-    test_url = "https://www.xinhua.net.cn/example-article"
-    result = parse_xinhua_article(test_url)
-    if result:
-        print("✅ Crawl succeeded:", result['title'])
-    else:
-        print("❌ Crawl failed")
\ No newline at end of file
+    print("✅ Fetching the Xinhua channel list...")
+    channels = get_channel_urls()
+    print(f"Found {len(channels)} channels")
+
+    all_articles = []
+
+    # Fetch the article links of each channel concurrently
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        future_to_channel = {
+            executor.submit(get_article_urls_from_channel, ch_url): ch_url
+            for ch_url in channels
+        }
+
+        for future in as_completed(future_to_channel):
+            ch_url = future_to_channel[future]
+            try:
+                articles = future.result()
+                print(f"\n➡️ Channel: {ch_url}")
+                print(f"   Found {len(articles)} articles in this channel")
+
+                # Parse every article concurrently
+                with ThreadPoolExecutor(max_workers=5) as article_executor:
+                    article_futures = {
+                        article_executor.submit(parse_xinhua_article, art_url, time_range_days): art_url
+                        for art_url in articles
+                    }
+
+                    for article_future in as_completed(article_futures):
+                        article = article_future.result()
+                        if article:
+                            print(f"   ✔️ Article: {article['title']}")
+                            # Update or create the article and mark it as crawled
+                            Article.objects.update_or_create(
+                                url=article['url'],
+                                defaults={
+                                    'title': article['title'],
+                                    'content': article['content'],
+                                    'crawled': True  # mark as crawled
+                                }
+                            )
+                            all_articles.append(article)
+                        else:
+                            print(f"   ❌ Failed to parse article: {article_futures[article_future]}")
+            except Exception as exc:
+                print(f"❌ Exception while crawling channel {ch_url}: {exc}")
+
+    print(f"\n✅ Crawl finished, {len(all_articles)} articles fetched")
+    return all_articles
+
+if __name__ == "__main__":
+    crawl_xinhua_green(time_range_days=7)  # example: only crawl articles from the last 7 days
diff --git a/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py b/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
index 7b34c27..ac3dea9 100644
--- a/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
+++ b/src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
@@ -1,10 +1,10 @@
-# src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
-
 from django.core.management.base import BaseCommand
 from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
 
 class Command(BaseCommand):
-    help = "Fetch Xinhua material related to green development"
+    help = 'Crawl Xinhua articles and save them to the database'
 
     def handle(self, *args, **kwargs):
-        crawl_xinhua_green()
+        self.stdout.write("Starting crawl...")
+        articles = crawl_xinhua_green()
+        self.stdout.write(f"Crawl finished, {len(articles)} articles fetched")
\ No newline at end of file
diff --git a/src/green_classroom/apps/collector/migrations/0002_rename_is_verified_article_crawled_and_more.py b/src/green_classroom/apps/collector/migrations/0002_rename_is_verified_article_crawled_and_more.py
new file mode 100644
index 0000000..daec8eb
--- /dev/null
+++ b/src/green_classroom/apps/collector/migrations/0002_rename_is_verified_article_crawled_and_more.py
@@ -0,0 +1,39 @@
+# Generated by Django 5.1 on 2025-07-21 12:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('collector', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='article',
+            old_name='is_verified',
+            new_name='crawled',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='category',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='created_at',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='publish_date',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='source',
+        ),
+        migrations.AlterField(
+            model_name='article',
+            name='title',
+            field=models.CharField(max_length=255),
+        ),
+    ]
diff --git a/src/green_classroom/apps/collector/models.py b/src/green_classroom/apps/collector/models.py
index 872e01f..627b79d 100644
--- a/src/green_classroom/apps/collector/models.py
+++ b/src/green_classroom/apps/collector/models.py
@@ -10,21 +10,10 @@ class SourceSite(models.Model):
         return self.name
 
 class Article(models.Model):
-    CATEGORY_CHOICES = [
-        ('Policy', 'Policy'),
-        ('Case study', 'Case study'),
-        ('News', 'News'),
-        ('Research', 'Research'),
-    ]
-
-    title = models.CharField(max_length=300)
     url = models.URLField(unique=True)
-    publish_date = models.DateField(null=True, blank=True)
+    title = models.CharField(max_length=255)
     content = models.TextField()
-    category = models.CharField(max_length=100, choices=CATEGORY_CHOICES)
-    source = models.ForeignKey(SourceSite, on_delete=models.SET_NULL, null=True)
-    created_at = models.DateTimeField(auto_now_add=True)
-    is_verified = models.BooleanField(default=False)
+    crawled = models.BooleanField(default=False)  # make sure this field exists
 
     def __str__(self):
         return self.title
diff --git a/src/green_classroom/apps/collector/urls.py b/src/green_classroom/apps/collector/urls.py
index d220e8b..2b26fea 100644
--- a/src/green_classroom/apps/collector/urls.py
+++ b/src/green_classroom/apps/collector/urls.py
@@ -1,9 +1,10 @@
 from django.urls import path
-from . import views
+from green_classroom.apps.collector import views
 
-app_name = 'collector'
+app_name = 'collector'  # make sure the namespace is correct
 
 urlpatterns = [
-    path('', views.article_list, name='article_list'),
-    path('<int:pk>/', views.article_detail, name='article_detail'),
-]
+    path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
+    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
+    path('articles/', views.list_articles, name='article_list'),  # add this line
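+    # Assumed addition: views.py defines run_crawler() and a
+    # collector/run_crawler.html template exists, but no URL in this change
+    # points at the view, so the page would be unreachable. The path and the
+    # route name here are assumptions, not part of the original change.
+    path('run_crawler/', views.run_crawler, name='run_crawler'),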
+]
\ No newline at end of file
diff --git a/src/green_classroom/apps/collector/views.py b/src/green_classroom/apps/collector/views.py
index 0e28803..1e03f69 100644
--- a/src/green_classroom/apps/collector/views.py
+++ b/src/green_classroom/apps/collector/views.py
@@ -1,20 +1,51 @@
-from django.shortcuts import render, get_object_or_404
-from .models import Article
-from django.db.models import Q
+from django.shortcuts import render, get_object_or_404, redirect  # redirect is required by delete_all_articles below
+from django.core.management import call_command
+from green_classroom.apps.collector.models import Article
+import os
+from django.conf import settings
+from django.template import TemplateDoesNotExist
 
-def article_list(request):
-    query = request.GET.get('q', '')
-    articles = Article.objects.filter(is_verified=True)
+def list_articles(request):
+    """
+    View that lists all articles.
+    """
+    articles = Article.objects.all()
+    return render(request, 'collector/article_list.html', {'articles': articles})
 
-    if query:
-        articles = articles.filter(Q(title__icontains=query) | Q(content__icontains=query))
+def article_detail(request, article_id):
+    """
+    View that shows the detail of a single article.
+    """
+    article = get_object_or_404(Article, id=article_id)
+    return render(request, 'collector/article_detail.html', {'article': article})
 
-    return render(request, 'collector/article_list.html', {
-        'articles': articles.order_by('-publish_date')[:100]
-    })
+def run_crawler(request):
+    result = []
+    if request.method == 'POST':
+        # Run the crawler command and capture its output
+        from io import StringIO
+        output = StringIO()
+        call_command('crawl_xinhua', stdout=output)
+        result.append(output.getvalue())
+
+    # Debug: print the template path
+    template_path = os.path.join(settings.BASE_DIR, 'templates', 'collector', 'run_crawler.html')
+    print(f"🔍 Looking for template file: {template_path}")
+
+    # Debug: check whether the template exists
+    try:
+        with open(template_path, 'r', encoding='utf-8') as f:
+            print("✅ Template file exists")
+    except FileNotFoundError:
+        print("❌ Template file not found, check the path")
+
+    return render(request, 'collector/run_crawler.html', {'output': result})
 
-def article_detail(request, pk):
-    article = get_object_or_404(Article, pk=pk, is_verified=True)
-    return render(request, 'collector/article_detail.html', {
-        'article': article
-    })
+def delete_all_articles(request):
+    """
+    View that deletes all articles.
+    """
+    if request.method == 'POST':
+        Article.objects.all().delete()
+        return redirect('collector:article_list')
+    return render(request, 'collector/delete_all_articles.html')
diff --git a/src/green_classroom/settings.py b/src/green_classroom/settings.py
index f15efac..4836e01 100644
--- a/src/green_classroom/settings.py
+++ b/src/green_classroom/settings.py
@@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
 """
 
 from pathlib import Path
+import os
 
 # Build paths inside the project like this: BASE_DIR / 'subdir'.
 BASE_DIR = Path(__file__).resolve().parent.parent
@@ -55,7 +56,7 @@ ROOT_URLCONF = 'green_classroom.urls'
 
 TEMPLATES = [
     {
         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-        'DIRS': [],
+        'DIRS': [os.path.join(BASE_DIR, 'templates')],  # ✅ make sure this line is present
         'APP_DIRS': True,
         'OPTIONS': {
             'context_processors': [
@@ -122,3 +123,6 @@ STATIC_URL = 'static/'
 
 # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
 DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
+# Raise the maximum number of form fields so the admin can handle large submissions
+DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
diff --git a/src/green_classroom/templates/collector/article_detail.html b/src/green_classroom/templates/collector/article_detail.html
new file mode 100644
index 0000000..e3213fb
--- /dev/null
+++ b/src/green_classroom/templates/collector/article_detail.html
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>{{ article.title }}</title>
+</head>
+<body>
+    <h1>{{ article.title }}</h1>
+    <p>{{ article.content }}</p>
+    <a href="{% url 'collector:article_list' %}">Back to the article list</a>
+</body>
+</html>
\ No newline at end of file
diff --git a/src/green_classroom/templates/collector/article_list.html b/src/green_classroom/templates/collector/article_list.html
new file mode 100644
index 0000000..1412a91
--- /dev/null
+++ b/src/green_classroom/templates/collector/article_list.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Article list</title>
+</head>
+<body>
+    <h1>All articles</h1>
+    <ul>
+        {% for article in articles %}
+            <li>
+                <a href="{% url 'collector:article_detail' article.id %}">{{ article.title }}</a>
+            </li>
+        {% endfor %}
+    </ul>
+</body>
+</html>
\ No newline at end of file
diff --git a/src/green_classroom/templates/collector/delete_all_articles.html b/src/green_classroom/templates/collector/delete_all_articles.html
new file mode 100644
index 0000000..4682ff1
--- /dev/null
+++ b/src/green_classroom/templates/collector/delete_all_articles.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Delete all articles</title>
+</head>
+<body>
+    <h1>Confirm deleting all articles</h1>
+    <p>Are you sure you want to delete all articles? This action cannot be undone.</p>
+    <form method="post">
+        {% csrf_token %}
+        <button type="submit">Delete all articles</button>
+    </form>
+    <a href="{% url 'collector:article_list' %}">Cancel</a>
+</body>
+</html>
\ No newline at end of file
diff --git a/src/green_classroom/templates/collector/run_crawler.html b/src/green_classroom/templates/collector/run_crawler.html
new file mode 100644
index 0000000..1b9e602
--- /dev/null
+++ b/src/green_classroom/templates/collector/run_crawler.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Run crawler</title>
+</head>
+<body>
+    <h1>Run crawler</h1>
+    <form method="post">
+        {% csrf_token %}
+        <button type="submit">Run crawler</button>
+    </form>
+    {% if output %}
+        <h2>Output:</h2>
+        <pre>{{ output }}</pre>
+    {% endif %}
+</body>
+</html>
\ No newline at end of file
diff --git a/src/green_classroom/urls.py b/src/green_classroom/urls.py
index 3d5312c..b5b4007 100644
--- a/src/green_classroom/urls.py
+++ b/src/green_classroom/urls.py
@@ -1,7 +1,9 @@
-from django.contrib import admin
 from django.urls import path, include
+from django.contrib import admin  # added import
+from green_classroom.apps.collector import views
 
 urlpatterns = [
-    path('admin/', admin.site.urls),
-    path('collector/', include('green_classroom.apps.collector.urls')),
-]
+    path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
+    path('articles/', views.list_articles, name='article_list'),
+    path('admin/', admin.site.urls),  # add this line to restore access to the admin site
+]
\ No newline at end of file