Compare commits
1 Commit
| Author | SHA1 | Date |
|---|---|---|
|  | d95a1fd5fb |  |
```diff
@@ -3,13 +3,27 @@ from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta
-import pytz  # 确保已安装 pytz 库,用于处理时区
+import pytz
+import logging
+import sys
+
+# 配置logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+logger = logging.getLogger(__name__)
+
 BASE_URL = "https://www.news.cn/"
 HEADERS = {
     "User-Agent": "Mozilla/5.0"
 }


 def parse_xinhua_article(url: str, time_range_days: int = None):
     """
     解析新华网文章页,返回 dict 包含标题与正文。
@@ -19,7 +33,7 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         resp = requests.get(url, headers=HEADERS, timeout=10)
         resp.encoding = 'utf-8'
     except requests.RequestException as e:
-        print(f"❌ 请求失败:{e} URL: {url}")
+        logger.error(f"❌ 请求失败:{e} URL: {url}")
         return None

     soup = BeautifulSoup(resp.text, "html.parser")
@@ -29,14 +43,14 @@ def parse_xinhua_article(url: str, time_range_days: int = None):

     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print(f"❌ 没找到 detailContent: {url}")
+        logger.error(f"❌ 没找到 detailContent: {url}")
         return None

     paragraphs = content_tag.find_all("p")
     content = "\n".join(p.get_text(strip=True) for p in paragraphs)

     if len(content.strip()) < 50:
-        print(f"⚠️ 内容过短:{url}")
+        logger.warning(f"⚠️ 内容过短:{url}")
         return None

     # 提取发布时间(假设格式为 YYYY-MM-DD)
@@ -46,18 +60,19 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         try:
             publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
         except ValueError:
-            print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
+            logger.error(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
             return None
     else:
-        print(f"❌ 页面未找到发布时间:{url}")
+        logger.error(f"❌ 页面未找到发布时间:{url}")
         return None

-    # 检查时间范围
-    if time_range_days is not None:
-        cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
-        if publish_time < cutoff_time:
-            print(f"⏰ 文章超出时间范围:{url}")
-            return None
+    # 确保 title 和 content 为 UTF-8 编码的字符串
+    try:
+        title = title.encode('utf-8').decode('utf-8')
+        content = content.encode('utf-8').decode('utf-8')
+    except UnicodeDecodeError:
+        logger.error(f"❌ 字符编码错误,无法正确解码标题或内容:{url}")
+        return None

     return {
         "url": url,
@@ -66,30 +81,31 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         "publish_time": publish_time
     }


 def crawl_xinhua_green(time_range_days: int = None):
     """
     爬取新华网所有频道及其文章,并保存到数据库(支持多线程)。
     可选参数:time_range_days(仅爬取最近指定天数内的文章)
     """
-    print("✅ 开始爬取新华网栏目列表...")
+    logger.info("✅ 开始爬取新华网栏目列表...")
     channels = get_channel_urls()
-    print(f"共找到 {len(channels)} 个频道")
+    logger.info(f"共找到 {len(channels)} 个频道")

     all_articles = []

     # 并发抓取每个频道的文章链接
     with ThreadPoolExecutor(max_workers=5) as executor:
         future_to_channel = {
             executor.submit(get_article_urls_from_channel, ch_url): ch_url
             for ch_url in channels
         }

         for future in as_completed(future_to_channel):
             ch_url = future_to_channel[future]
             try:
                 articles = future.result()
-                print(f"\n➡️ 抓取频道:{ch_url}")
-                print(f" 该频道找到 {len(articles)} 篇文章")
+                logger.info(f"\n➡️ 抓取频道:{ch_url}")
+                logger.info(f" 该频道找到 {len(articles)} 篇文章")
+
                 # 并发解析每篇文章
                 with ThreadPoolExecutor(max_workers=5) as article_executor:
@@ -101,7 +117,7 @@ def crawl_xinhua_green(time_range_days: int = None):
                     for article_future in as_completed(article_futures):
                         article = article_future.result()
                         if article:
-                            print(f" ✔️ 文章:{article['title']}")
+                            logger.info(f" ✔️ 文章:{article['title']}")
                             # 更新或创建文章,并标记 crawled=True
                             Article.objects.update_or_create(
                                 url=article['url'],
@@ -113,12 +129,44 @@ def crawl_xinhua_green(time_range_days: int = None):
                             )
                             all_articles.append(article)
                         else:
-                            print(f" ❌ 文章解析失败:{article_futures[article_future]}")
+                            logger.error(f" ❌ 文章解析失败:{article_futures[article_future]}")
             except Exception as exc:
-                print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
+                logger.error(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")

-    print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
+    logger.info(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
     return all_articles

+
 if __name__ == "__main__":
+    logger.info("开始爬取...")
     crawl_xinhua_green(time_range_days=7)  # 示例:仅爬取最近 7 天的文章
+    logger.info("爬取完成")
+
+
+from django.http import JsonResponse
+
+
+def export_articles(request):
+    """
+    导出所有爬取的文章为 JSON 格式。
+    可通过访问 /export/xinhua-article/ 触发
+    """
+    time_range_days = request.GET.get('time_range_days', None)
+    if time_range_days is not None:
+        try:
+            time_range_days = int(time_range_days)
+        except ValueError:
+            logger.error("❌ 无效的时间范围参数")
+            return JsonResponse({"error": "无效的时间范围参数"}, status=400)
+
+    logger.info(f"开始导出文章,时间范围:{time_range_days} 天")
+    # 获取文章数据
+    articles = crawl_xinhua_green(time_range_days=time_range_days)
+
+    logger.info(f"成功导出 {len(articles)} 篇文章")
+    # 返回 JSON 响应,并确保中文不被转义
+    return JsonResponse(
+        articles,
+        safe=False,
+        json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')}  # 减少空格,更兼容 Safari
+    )
```
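For context on how the new `export_articles` view is meant to be consumed, here is a minimal client-side sketch. It assumes the view is wired to `/collector/export/xinhua-article/` (the path the template's `fetch()` call further below uses; the URLconf entry for it is not part of this diff) and a local dev server at `127.0.0.1:8000` — both are assumptions, not confirmed by the commit.

```python
# Hedged usage sketch: the host/port and the export route are assumptions (see note above).
import requests

resp = requests.get(
    "http://127.0.0.1:8000/collector/export/xinhua-article/",
    params={"time_range_days": 7},  # same query parameter export_articles reads from request.GET
    timeout=300,                    # the view crawls synchronously, so allow a generous timeout
)
resp.raise_for_status()
articles = resp.json()  # JsonResponse(..., safe=False) serializes the list of article dicts
print(f"exported {len(articles)} articles")
```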
```diff
@@ -0,0 +1,18 @@
+# Generated by Django 5.1 on 2025-07-22 07:53
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('collector', '0002_rename_is_verified_article_crawled_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='article',
+            name='publish_time',
+            field=models.DateTimeField(blank=True, null=True),
+        ),
+    ]
```
```diff
@@ -14,6 +14,7 @@ class Article(models.Model):
     title = models.CharField(max_length=255)
     content = models.TextField()
     crawled = models.BooleanField(default=False)  # 确保此字段存在
+    publish_time = models.DateTimeField(null=True, blank=True)  # 添加 publish_time 字段

     def __str__(self):
         return self.title
```
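The migration and model change above add a nullable `publish_time` to `Article`, which the crawler fills with a timezone-aware UTC datetime. A minimal sketch of how recent articles could then be queried from this project's Django shell; the 7-day window is only an example, mirroring the `time_range_days=7` call in the crawler's `__main__` block.

```python
# Hedged sketch: assumes it runs inside this project's Django shell (settings configured).
from datetime import timedelta

from django.utils import timezone

from green_classroom.apps.collector.models import Article

cutoff = timezone.now() - timedelta(days=7)  # keep only articles from the last week
recent = Article.objects.filter(publish_time__gte=cutoff).order_by("-publish_time")
for article in recent[:10]:
    print(article.publish_time, article.title)
```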
```diff
@@ -20,5 +20,42 @@
     <li>暂无内容</li>
     {% endfor %}
 </ul>
+
+<div>
+    <input type="number" id="timeRange" placeholder="时间范围(天)" />
+    <button onclick="exportArticles()">导出文章</button>
+</div>
+
+<script>
+    function exportArticles() {
+        const timeRange = document.getElementById('timeRange').value;
+        let exportUrl = '/collector/export/xinhua-article/';
+
+        if (timeRange) {
+            exportUrl += `?time_range_days=${timeRange}`;
+        }
+
+        // 发起请求并触发下载
+        fetch(exportUrl)
+            .then(response => {
+                if (!response.ok) {
+                    throw new Error('导出失败');
+                }
+                return response.blob();
+            })
+            .then(blob => {
+                const downloadUrl = window.URL.createObjectURL(blob);
+                const a = document.createElement('a');
+                a.href = downloadUrl;
+                a.download = 'articles.json';
+                document.body.appendChild(a);
+                a.click();
+                a.remove();
+            })
+            .catch(error => {
+                alert(error);
+            });
+    }
+</script>
 </body>
 </html>
```
```diff
@@ -1,10 +1,8 @@
-from django.urls import path
-from green_classroom.apps.collector import views
+app_name = 'collector'  # 添加 app_name 定义命名空间

-app_name = 'collector'  # 确保命名空间正确
+from django.urls import path
+from . import views

 urlpatterns = [
-    path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
-    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
-    path('articles/', views.list_articles, name='article_list'),  # 添加这一行
+    path('article/<int:pk>/', views.article_detail, name='article_detail'),
 ]
```
```diff
@@ -1,3 +1,29 @@
+import logging
+import sys
+
+# 配置logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+logger = logging.getLogger(__name__)
+
+
+from django.shortcuts import render
+from django.http import HttpResponse
+from django.http import JsonResponse
+from .models import Article  # 假设 Article 是你的模型
+
+def index(request):
+    """
+    主页视图,显示欢迎信息或文章列表。
+    """
+    return HttpResponse("欢迎来到绿色课堂资料库!")
+
 from django.shortcuts import render, get_object_or_404
 from django.core.management import call_command
 from green_classroom.apps.collector.models import Article
@@ -12,12 +38,25 @@ def list_articles(request):
     articles = Article.objects.all()
     return render(request, 'collector/article_list.html', {'articles': articles})

-def article_detail(request, article_id):
+def article_detail(request, pk):
     """
-    展示单篇文章的详细内容
+    根据主键 pk 获取文章详情。
     """
-    article = get_object_or_404(Article, id=article_id)
-    return render(request, 'collector/article_detail.html', {'article': article})
+    try:
+        article = Article.objects.get(pk=pk)
+        logger.info(f"返回文章标题: {article.title}")  # 添加日志确认标题是否正常
+        logger.info(f"返回文章内容: {article.content[:100]}...")  # 输出前100字符确认内容是否正常
+        data = {
+            'id': article.id,
+            'url': article.url,
+            'title': str(article.title),  # 强制转换为 str,避免潜在的编码问题
+            'content': str(article.content),  # 强制转换为 str
+            'publish_time': article.publish_time.isoformat() if article.publish_time else None,  # 格式化时间
+            'crawled': article.crawled
+        }
+        return JsonResponse(data, json_dumps_params={'ensure_ascii': False})
+    except Article.DoesNotExist:
+        return JsonResponse({'error': '文章不存在'}, status=404)

 def run_crawler(request):
     result = []
```
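Since `article_detail` now returns JSON keyed by `pk` instead of rendering a template, a quick smoke test with Django's test client can confirm the new response shape. This is a rough sketch under assumptions: a configured Django test environment, the `/collector/` prefix from the project URLconf further down, and an illustrative `pk=1` that may not exist in your database.

```python
# Hedged sketch: meant for a Django TestCase or shell with settings loaded; pk=1 is illustrative only.
from django.test import Client

client = Client()
response = client.get("/collector/article/1/")  # matches the article/<int:pk>/ route in the collector URLconf above
if response.status_code == 200:
    data = response.json()
    print(data["title"], data["publish_time"], data["crawled"])
else:
    print("lookup failed:", response.status_code)  # the view answers 404 with {'error': '文章不存在'}
```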
```diff
@@ -25,8 +25,9 @@ SECRET_KEY = 'django-insecure-mi#9dyl0zwanl2=uziz3om_t**ovk08+pg127^+=5m=s6^+(@b

 # SECURITY WARNING: don't run with debug turned on in production!
 DEBUG = True
+#DEBUG = False

-ALLOWED_HOSTS = []
+ALLOWED_HOSTS = ['*',]


 # Application definition
@@ -126,3 +127,14 @@ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

 # 增加最大字段数限制,适应 admin 页面大数据量展示
 DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
+
+
+import sentry_sdk
+
+sentry_sdk.init(
+    dsn="https://a976759c113a1e07050c61fb4dfe16bc@sentry.yuangyaa.com/2",
+    # Add data like request headers and IP for users,
+    # see https://docs.sentry.io/platforms/python/data-management/data-collected/ for more info
+    send_default_pii=True,
+    environment="staging",  # 或其他你设置的环境名
+)
```
```diff
@@ -6,4 +6,5 @@ urlpatterns = [
     path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
     path('articles/', views.list_articles, name='article_list'),
     path('admin/', admin.site.urls),  # 添加这一行以恢复 admin 页面访问
 ]
+
```