add multithreading && Xinhuanet (新华网) crawler
src/green_classroom/apps/collector/admin.py
@@ -1,13 +1,7 @@
 from django.contrib import admin
-from .models import Article, SourceSite
+from .models import Article
 
 @admin.register(Article)
 class ArticleAdmin(admin.ModelAdmin):
-    list_display = ('title', 'category', 'publish_date', 'source', 'is_verified')
-    list_filter = ('category', 'source', 'is_verified')
-    search_fields = ('title', 'content')
-
-@admin.register(SourceSite)
-class SourceSiteAdmin(admin.ModelAdmin):
-    list_display = ('name', 'url', 'is_active')
-    search_fields = ('name', 'url')
+    list_display = ('title', 'url', 'crawled')
+    search_fields = ('title',)
src/green_classroom/apps/collector/crawler/xinhua.py
@@ -1,29 +1,32 @@
-# xinhua.py
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timedelta
+import pytz  # make sure pytz is installed; it is used for timezone handling
 
-def parse_xinhua_article(url: str):
-    """
-    Parse a Xinhuanet article page and return a dict with the title and body
-    """
-    headers = {
-        "User-Agent": "Mozilla/5.0"
-    }
+BASE_URL = "https://www.news.cn/"
+HEADERS = {
+    "User-Agent": "Mozilla/5.0"
+}
 
+def parse_xinhua_article(url: str, time_range_days: int = None):
+    """
+    Parse a Xinhuanet article page and return a dict with the title and body.
+    If time_range_days is given, keep only articles within that time range.
+    """
     try:
-        resp = requests.get(url, headers=headers, timeout=10)
+        resp = requests.get(url, headers=HEADERS, timeout=10)
         resp.encoding = 'utf-8'
     except requests.RequestException as e:
-        print(f"❌ Request failed: {e}")
+        print(f"❌ Request failed: {e} URL: {url}")
         return None
 
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    # Extract the title
     title_tag = soup.find("span", class_="title")
     title = title_tag.get_text(strip=True) if title_tag else "无标题"
 
-    # Extract the body
    content_tag = soup.find("span", id="detailContent")
     if not content_tag:
         print(f"❌ detailContent not found: {url}")
@@ -32,26 +35,90 @@ def parse_xinhua_article(url: str):
     paragraphs = content_tag.find_all("p")
     content = "\n".join(p.get_text(strip=True) for p in paragraphs)
 
-    # Validity check
     if len(content.strip()) < 50:
         print(f"⚠️ Content too short: {url}")
         return None
 
+    # Extract the publish time (assumed format: YYYY-MM-DD)
+    publish_time_tag = soup.find("span", class_="publish-time")
+    if publish_time_tag:
+        publish_time_str = publish_time_tag.get_text(strip=True)
+        try:
+            publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
+        except ValueError:
+            print(f"❌ Could not parse time: {publish_time_str} URL: {url}")
+            return None
+    else:
+        print(f"❌ No publish time found on the page: {url}")
+        return None
+
+    # Check the time range
+    if time_range_days is not None:
+        cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
+        if publish_time < cutoff_time:
+            print(f"⏰ Article outside the time range: {url}")
+            return None
+
     return {
         "url": url,
         "title": title,
-        "content": content
+        "content": content,
+        "publish_time": publish_time
     }
 
-def crawl_xinhua_green():
+def crawl_xinhua_green(time_range_days: int = None):
     """
-    Entry point that starts the Xinhuanet green-content crawler
+    Crawl all Xinhuanet channels and their articles and save them to the database (multithreaded).
+    Optional: time_range_days (only crawl articles from the last N days).
     """
-    print("✅ Starting to crawl Xinhuanet green content...")
-    # Sample URL; extend as needed
-    test_url = "https://www.xinhua.net.cn/example-article"
-    result = parse_xinhua_article(test_url)
-    if result:
-        print("✅ Crawl succeeded:", result['title'])
-    else:
-        print("❌ Crawl failed")
+    print("✅ Fetching the Xinhuanet channel list...")
+    channels = get_channel_urls()
+    print(f"Found {len(channels)} channels")
+
+    all_articles = []
+
+    # Fetch each channel's article links concurrently
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        future_to_channel = {
+            executor.submit(get_article_urls_from_channel, ch_url): ch_url
+            for ch_url in channels
+        }
+
+        for future in as_completed(future_to_channel):
+            ch_url = future_to_channel[future]
+            try:
+                articles = future.result()
+                print(f"\n➡️ Crawling channel: {ch_url}")
+                print(f"   Found {len(articles)} articles in this channel")
+
+                # Parse each article concurrently
+                with ThreadPoolExecutor(max_workers=5) as article_executor:
+                    article_futures = {
+                        article_executor.submit(parse_xinhua_article, art_url, time_range_days): art_url
+                        for art_url in articles
+                    }
+
+                    for article_future in as_completed(article_futures):
+                        article = article_future.result()
+                        if article:
+                            print(f"   ✔️ Article: {article['title']}")
+                            # Update or create the article and mark crawled=True
+                            Article.objects.update_or_create(
+                                url=article['url'],
+                                defaults={
+                                    'title': article['title'],
+                                    'content': article['content'],
+                                    'crawled': True  # mark as crawled
+                                }
+                            )
+                            all_articles.append(article)
+                        else:
+                            print(f"   ❌ Failed to parse article: {article_futures[article_future]}")
+            except Exception as exc:
+                print(f"❌ Exception while crawling channel {ch_url}: {exc}")
+
+    print(f"\n✅ Crawl finished; fetched {len(all_articles)} articles in total")
+    return all_articles
+
+if __name__ == "__main__":
+    crawl_xinhua_green(time_range_days=7)  # example: only crawl articles from the last 7 days
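crawl_xinhua_green() above calls get_channel_urls() and get_article_urls_from_channel() and saves through the Article model, yet none of the three is defined or imported in the hunks shown; presumably they live elsewhere in xinhua.py. A minimal sketch of what the two helpers could look like, reusing BASE_URL and HEADERS from the top of the file — the selectors and the .htm heuristic are assumptions, not part of the commit:

# Hypothetical companions to crawl_xinhua_green(); not shown in this commit.
# requests, BeautifulSoup, urljoin, BASE_URL and HEADERS are already imported
# at the top of xinhua.py; Article is assumed to be imported like this:
from green_classroom.apps.collector.models import Article

def get_channel_urls():
    """Collect channel index URLs from the Xinhuanet front page (selector is a guess)."""
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    links = {urljoin(BASE_URL, a["href"]) for a in soup.select("a[href]")}
    # Keep only links on the same site.
    return sorted(link for link in links if link.startswith(BASE_URL))

def get_article_urls_from_channel(channel_url):
    """Collect article URLs from one channel page (heuristic: links ending in .htm)."""
    resp = requests.get(channel_url, headers=HEADERS, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    return [urljoin(channel_url, a["href"]) for a in soup.select("a[href]")
            if a["href"].endswith(".htm")]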
src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
@@ -1,10 +1,10 @@
-# src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
-
 from django.core.management.base import BaseCommand
 from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
 
 class Command(BaseCommand):
-    help = "Crawl Xinhuanet green-development materials"
+    help = 'Crawl Xinhuanet articles and save them to the database'
 
     def handle(self, *args, **kwargs):
-        crawl_xinhua_green()
+        self.stdout.write("Starting crawl...")
+        articles = crawl_xinhua_green()
+        self.stdout.write(f"Crawl finished; fetched {len(articles)} articles")
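Note that crawl_xinhua_green() now accepts time_range_days, but handle() never passes it, so the command always crawls without a time limit. If the option should be exposed on the command line, Django's standard add_arguments hook covers it; a sketch (the --days flag name is an assumption):

class Command(BaseCommand):
    help = 'Crawl Xinhuanet articles and save them to the database'

    def add_arguments(self, parser):
        # Optional; forwarded to crawl_xinhua_green(time_range_days=...)
        parser.add_argument('--days', type=int, default=None,
                            help='Only crawl articles published in the last N days')

    def handle(self, *args, **options):
        self.stdout.write("Starting crawl...")
        articles = crawl_xinhua_green(time_range_days=options['days'])
        self.stdout.write(f"Crawl finished; fetched {len(articles)} articles")

which would then be invoked as python manage.py crawl_xinhua --days 7.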
@@ -0,0 +1,39 @@
+# Generated by Django 5.1 on 2025-07-21 12:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('collector', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='article',
+            old_name='is_verified',
+            new_name='crawled',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='category',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='created_at',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='publish_date',
+        ),
+        migrations.RemoveField(
+            model_name='article',
+            name='source',
+        ),
+        migrations.AlterField(
+            model_name='article',
+            name='title',
+            field=models.CharField(max_length=255),
+        ),
+    ]
src/green_classroom/apps/collector/models.py
@@ -10,21 +10,10 @@ class SourceSite(models.Model):
         return self.name
 
 class Article(models.Model):
-    CATEGORY_CHOICES = [
-        ('政策', '政策'),
-        ('案例', '案例'),
-        ('新闻', '新闻'),
-        ('科研', '科研'),
-    ]
-
-    title = models.CharField(max_length=300)
     url = models.URLField(unique=True)
-    publish_date = models.DateField(null=True, blank=True)
+    title = models.CharField(max_length=255)
     content = models.TextField()
-    category = models.CharField(max_length=100, choices=CATEGORY_CHOICES)
-    source = models.ForeignKey(SourceSite, on_delete=models.SET_NULL, null=True)
-    created_at = models.DateTimeField(auto_now_add=True)
-    is_verified = models.BooleanField(default=False)
+    crawled = models.BooleanField(default=False)  # make sure this field exists
 
     def __str__(self):
         return self.title
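One mismatch this commit leaves behind: parse_xinhua_article() now extracts and returns publish_time, but the migration above deletes publish_date, the trimmed model has no replacement field, and the defaults dict in crawl_xinhua_green() never stores it. If the timestamp should be kept, a sketch of the extra field — the name publish_time is an assumption, and adding it would require a further migration:

class Article(models.Model):
    url = models.URLField(unique=True)
    title = models.CharField(max_length=255)
    content = models.TextField()
    crawled = models.BooleanField(default=False)
    publish_time = models.DateTimeField(null=True, blank=True)  # assumed field for the parsed timestamp

    def __str__(self):
        return self.title

with 'publish_time': article['publish_time'] added to the defaults in crawl_xinhua_green().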
src/green_classroom/apps/collector/urls.py
@@ -1,9 +1,10 @@
 from django.urls import path
-from . import views
+from green_classroom.apps.collector import views
 
-app_name = 'collector'
+app_name = 'collector'  # make sure the namespace is correct
 
 urlpatterns = [
-    path('', views.article_list, name='article_list'),
-    path('<int:pk>/', views.article_detail, name='article_detail'),
+    path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
+    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
+    path('articles/', views.list_articles, name='article_list'),  # added this line
 ]
src/green_classroom/apps/collector/views.py
@@ -1,20 +1,52 @@
-from django.shortcuts import render, get_object_or_404
-from .models import Article
-from django.db.models import Q
+from django.shortcuts import render, get_object_or_404, redirect  # redirect is used by delete_all_articles below
+from django.core.management import call_command
+from green_classroom.apps.collector.models import Article
+import os
+from django.conf import settings
+from django.template import TemplateDoesNotExist
 
-def article_list(request):
-    query = request.GET.get('q', '')
-    articles = Article.objects.filter(is_verified=True)
-
-    if query:
-        articles = articles.filter(Q(title__icontains=query) | Q(content__icontains=query))
-
-    return render(request, 'collector/article_list.html', {
-        'articles': articles.order_by('-publish_date')[:100]
-    })
+def list_articles(request):
+    """
+    View that lists all articles
+    """
+    articles = Article.objects.all()
+    return render(request, 'collector/article_list.html', {'articles': articles})
 
-def article_detail(request, pk):
-    article = get_object_or_404(Article, pk=pk, is_verified=True)
-    return render(request, 'collector/article_detail.html', {
-        'article': article
-    })
+def article_detail(request, article_id):
+    """
+    View that shows a single article's full content
+    """
+    article = get_object_or_404(Article, id=article_id)
+    return render(request, 'collector/article_detail.html', {'article': article})
+
+def run_crawler(request):
+    result = []
+    if request.method == 'POST':
+        # Run the crawler command and capture its output
+        from io import StringIO
+        output = StringIO()
+        call_command('crawl_xinhua', stdout=output)
+        result.append(output.getvalue())
+
+    # Debug: print the template path
+    template_path = os.path.join(settings.BASE_DIR, 'templates', 'collector', 'run_crawler.html')
+    print(f"🔍 Looking for template file: {template_path}")
+
+    # Debug: check whether the template exists
+    try:
+        with open(template_path, 'r', encoding='utf-8') as f:
+            print("✅ Template file exists")
+    except FileNotFoundError:
+        print("❌ Template file not found; check the path")
+
+    return render(request, 'collector/run_crawler.html', {'output': result})
+
+def delete_all_articles(request):
+    """
+    View that deletes all articles
+    """
+    if request.method == 'POST':
+        Article.objects.all().delete()
+        return redirect('collector:article_list')
+    return render(request, 'collector/delete_all_articles.html')
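run_crawler is defined here, but neither urls.py hunk in this commit routes to it, so the run_crawler.html form added later is unreachable as committed. Assuming it should live under the collector namespace, one more entry in collector/urls.py would wire it up (URL and route name are assumptions):

    path('run_crawler/', views.run_crawler, name='run_crawler'),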
src/green_classroom/settings.py
@@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
 """
 
 from pathlib import Path
+import os
 
 # Build paths inside the project like this: BASE_DIR / 'subdir'.
 BASE_DIR = Path(__file__).resolve().parent.parent
@@ -55,7 +56,7 @@ ROOT_URLCONF = 'green_classroom.urls'
 TEMPLATES = [
     {
         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-        'DIRS': [],
+        'DIRS': [os.path.join(BASE_DIR, 'templates')],  # ✅ make sure this line exists
         'APP_DIRS': True,
         'OPTIONS': {
             'context_processors': [
@@ -122,3 +123,6 @@ STATIC_URL = 'static/'
 # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
 
 DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
+# Raise the maximum field count so the admin can display large datasets
+DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
src/green_classroom/templates/collector/article_detail.html (new file, 11 lines)
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>{{ article.title }}</title>
+</head>
+<body>
+    <h1>{{ article.title }}</h1>
+    <p>{{ article.content }}</p>
+    <a href="{% url 'collector:article_list' %}">Back to the article list</a>
+</body>
+</html>
src/green_classroom/templates/collector/article_list.html (new file, 19 lines)
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Article list</title>
+</head>
+<body>
+    <h1>All articles</h1>
+    <ul>
+        {% for article in articles %}
+        <li>
+            <a href="{% url 'collector:article_detail' article_id=article.id %}">
+                <strong>{{ article.title }}</strong>
+            </a><br>
+            {{ article.content|truncatewords:50 }}
+        </li>
+        {% endfor %}
+    </ul>
+</body>
+</html>
src/green_classroom/templates/collector/delete_all_articles.html (new file, 15 lines)
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Delete all articles</title>
+</head>
+<body>
+    <h1>Confirm deleting all articles</h1>
+    <p>Are you sure you want to delete all articles? This action cannot be undone.</p>
+    <form method="post">
+        {% csrf_token %}
+        <button type="submit">Delete all articles</button>
+    </form>
+    <a href="{% url 'collector:article_list' %}">Cancel</a>
+</body>
+</html>
src/green_classroom/templates/collector/run_crawler.html (new file, 17 lines)
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Run crawler</title>
+</head>
+<body>
+    <h1>Run crawler</h1>
+    <form method="post">
+        {% csrf_token %}
+        <button type="submit">Start crawling</button>
+    </form>
+    {% if output %}
+    <h2>Output:</h2>
+    <pre>{{ output }}</pre>
+    {% endif %}
+</body>
+</html>
src/green_classroom/urls.py
@@ -1,7 +1,9 @@
-from django.contrib import admin
 from django.urls import path, include
+from django.contrib import admin  # newly added import
+from green_classroom.apps.collector import views
 
 urlpatterns = [
-    path('admin/', admin.site.urls),
-    path('collector/', include('green_classroom.apps.collector.urls')),
-]
+    path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
+    path('articles/', views.list_articles, name='article_list'),
+    path('admin/', admin.site.urls),  # added back to restore access to the admin pages
+]