Compare commits
9 Commits
149c051dab
...
0.1
| Author | SHA1 | Date | |
|---|---|---|---|
| d95a1fd5fb | |||
| 61688f4bff | |||
| c750d77eab | |||
| babba5b923 | |||
| b273f22bba | |||
| 95efe66ad4 | |||
| 5b31b3ef3e | |||
| 3ecf847039 | |||
| e9ac04da60 |
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# 默认忽略的文件
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# 基于编辑器的 HTTP 客户端请求
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
27
.idea/green_classroom.iml
generated
Normal file
27
.idea/green_classroom.iml
generated
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="FacetManager">
|
||||||
|
<facet type="django" name="Django">
|
||||||
|
<configuration>
|
||||||
|
<option name="rootFolder" value="$MODULE_DIR$/src" />
|
||||||
|
<option name="settingsModule" value="green_classroom/settings.py" />
|
||||||
|
<option name="manageScript" value="$MODULE_DIR$/src/manage.py" />
|
||||||
|
<option name="environment" value="<map/>" />
|
||||||
|
<option name="doNotUseTestRunner" value="false" />
|
||||||
|
<option name="trackFilePattern" value="migrations" />
|
||||||
|
</configuration>
|
||||||
|
</facet>
|
||||||
|
</component>
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.12 (green_classroom)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
<component name="PyDocumentationSettings">
|
||||||
|
<option name="format" value="PLAIN" />
|
||||||
|
<option name="myDocStringFormat" value="Plain" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="Black">
|
||||||
|
<option name="sdkName" value="Python 3.12 (green_classroom)" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (green_classroom)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/green_classroom.iml" filepath="$PROJECT_DIR$/.idea/green_classroom.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
# green_classroom
|
# green_classroom
|
||||||
|
|
||||||
行政办 绿色课堂
|
行政办 绿色课堂
|
||||||
|
# init projects
|
||||||
|
|||||||
1
develop.txt
Normal file
1
develop.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
# 開發分支
|
||||||
6
requirements.in
Normal file
6
requirements.in
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
uv==0.7.22
|
||||||
|
django==5.1
|
||||||
|
psycopg2-binary
|
||||||
|
requests
|
||||||
|
beautifulsoup4
|
||||||
|
ipython
|
||||||
70
requirements.txt
Normal file
70
requirements.txt
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
#
|
||||||
|
# This file is autogenerated by pip-compile with Python 3.12
|
||||||
|
# by the following command:
|
||||||
|
#
|
||||||
|
# pip-compile requirements.in
|
||||||
|
#
|
||||||
|
--index-url https://pypi.nicestudio.com.tw/root/pypi
|
||||||
|
|
||||||
|
asgiref==3.9.1
|
||||||
|
# via django
|
||||||
|
asttokens==3.0.0
|
||||||
|
# via stack-data
|
||||||
|
beautifulsoup4==4.13.4
|
||||||
|
# via -r requirements.in
|
||||||
|
certifi==2025.7.14
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==3.4.2
|
||||||
|
# via requests
|
||||||
|
decorator==5.2.1
|
||||||
|
# via ipython
|
||||||
|
django==5.1
|
||||||
|
# via -r requirements.in
|
||||||
|
executing==2.2.0
|
||||||
|
# via stack-data
|
||||||
|
idna==3.10
|
||||||
|
# via requests
|
||||||
|
ipython==9.4.0
|
||||||
|
# via -r requirements.in
|
||||||
|
ipython-pygments-lexers==1.1.1
|
||||||
|
# via ipython
|
||||||
|
jedi==0.19.2
|
||||||
|
# via ipython
|
||||||
|
matplotlib-inline==0.1.7
|
||||||
|
# via ipython
|
||||||
|
parso==0.8.4
|
||||||
|
# via jedi
|
||||||
|
pexpect==4.9.0
|
||||||
|
# via ipython
|
||||||
|
prompt-toolkit==3.0.51
|
||||||
|
# via ipython
|
||||||
|
psycopg2-binary==2.9.10
|
||||||
|
# via -r requirements.in
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
# via pexpect
|
||||||
|
pure-eval==0.2.3
|
||||||
|
# via stack-data
|
||||||
|
pygments==2.19.2
|
||||||
|
# via
|
||||||
|
# ipython
|
||||||
|
# ipython-pygments-lexers
|
||||||
|
requests==2.32.4
|
||||||
|
# via -r requirements.in
|
||||||
|
soupsieve==2.7
|
||||||
|
# via beautifulsoup4
|
||||||
|
sqlparse==0.5.3
|
||||||
|
# via django
|
||||||
|
stack-data==0.6.3
|
||||||
|
# via ipython
|
||||||
|
traitlets==5.14.3
|
||||||
|
# via
|
||||||
|
# ipython
|
||||||
|
# matplotlib-inline
|
||||||
|
typing-extensions==4.14.1
|
||||||
|
# via beautifulsoup4
|
||||||
|
urllib3==2.5.0
|
||||||
|
# via requests
|
||||||
|
uv==0.7.22
|
||||||
|
# via -r requirements.in
|
||||||
|
wcwidth==0.2.13
|
||||||
|
# via prompt-toolkit
|
||||||
0
src/green_classroom/__init__.py
Normal file
0
src/green_classroom/__init__.py
Normal file
0
src/green_classroom/apps/collector/__init__.py
Normal file
0
src/green_classroom/apps/collector/__init__.py
Normal file
7
src/green_classroom/apps/collector/admin.py
Normal file
7
src/green_classroom/apps/collector/admin.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from django.contrib import admin
|
||||||
|
from .models import Article
|
||||||
|
|
||||||
|
class ArticleAdmin(admin.ModelAdmin):
    """Admin configuration for Article: list columns and title search."""

    list_display = ('title', 'url', 'crawled')
    search_fields = ('title',)


# Register explicitly instead of via the decorator — same effect.
admin.site.register(Article, ArticleAdmin)
|
||||||
6
src/green_classroom/apps/collector/apps.py
Normal file
6
src/green_classroom/apps/collector/apps.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class CollectorConfig(AppConfig):
    """App configuration for the collector app.

    ``name`` must be the full dotted path because the app lives under
    the ``green_classroom.apps`` package.
    """

    name = 'green_classroom.apps.collector'
    default_auto_field = 'django.db.models.BigAutoField'
|
||||||
172
src/green_classroom/apps/collector/crawler/xinhua.py
Normal file
172
src/green_classroom/apps/collector/crawler/xinhua.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import pytz
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# 配置logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s [%(levelname)s] %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.StreamHandler(sys.stdout)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
BASE_URL = "https://www.news.cn/"
|
||||||
|
HEADERS = {
|
||||||
|
"User-Agent": "Mozilla/5.0"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_xinhua_article(url: str, time_range_days: int = None):
    """
    Parse a Xinhua (news.cn) article page.

    Args:
        url: Article page URL.
        time_range_days: If given, discard articles published more than this
            many days ago (return None for them).

    Returns:
        dict with keys ``url``, ``title``, ``content``, ``publish_time``,
        or None on any failure: network error, missing ``detailContent``,
        content shorter than 50 chars, missing/unparsable publish time, or
        (bug fix) a publish time outside the requested range — the original
        accepted ``time_range_days`` but never applied it.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.encoding = 'utf-8'
    except requests.RequestException as e:
        logger.error(f"❌ 请求失败:{e} URL: {url}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        logger.error(f"❌ 没找到 detailContent: {url}")
        return None

    paragraphs = content_tag.find_all("p")
    content = "\n".join(p.get_text(strip=True) for p in paragraphs)

    # Very short bodies are almost certainly extraction failures.
    if len(content.strip()) < 50:
        logger.warning(f"⚠️ 内容过短:{url}")
        return None

    # Extract the publish time (expected format: YYYY-MM-DD).
    publish_time_tag = soup.find("span", class_="publish-time")
    if not publish_time_tag:
        logger.error(f"❌ 页面未找到发布时间:{url}")
        return None
    publish_time_str = publish_time_tag.get_text(strip=True)
    try:
        publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
    except ValueError:
        logger.error(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
        return None

    # BUG FIX: the original function documented the time-range filter but
    # never implemented it. Apply it here against the parsed publish time.
    if time_range_days is not None:
        cutoff = datetime.now(pytz.utc) - timedelta(days=time_range_days)
        if publish_time < cutoff:
            logger.info(f"⏭️ 文章超出时间范围,跳过:{url}")
            return None

    # NOTE: the original round-tripped title/content through
    # encode('utf-8').decode('utf-8'), which is a no-op on str and can
    # never raise UnicodeDecodeError — removed.
    return {
        "url": url,
        "title": title,
        "content": content,
        "publish_time": publish_time
    }
|
||||||
|
|
||||||
|
|
||||||
|
def crawl_xinhua_green(time_range_days: int = None):
    """
    Crawl all Xinhua channels and their articles (multi-threaded) and save
    each parsed article to the database.

    Args:
        time_range_days: Optional; forwarded to ``parse_xinhua_article`` to
            keep only articles from the last N days.

    Returns:
        list of article dicts that were successfully parsed and saved.
    """
    logger.info("✅ 开始爬取新华网栏目列表...")
    # NOTE(review): get_channel_urls / get_article_urls_from_channel / Article
    # are not defined or imported in the visible part of this module — verify
    # they are provided elsewhere in the file.
    channels = get_channel_urls()
    logger.info(f"共找到 {len(channels)} 个频道")

    all_articles = []

    # Fetch each channel's article links concurrently.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_channel = {
            executor.submit(get_article_urls_from_channel, ch_url): ch_url
            for ch_url in channels
        }

        for future in as_completed(future_to_channel):
            ch_url = future_to_channel[future]
            try:
                articles = future.result()
                logger.info(f"\n➡️ 抓取频道:{ch_url}")
                logger.info(f"  该频道找到 {len(articles)} 篇文章")

                # Parse every article of this channel concurrently.
                with ThreadPoolExecutor(max_workers=5) as article_executor:
                    article_futures = {
                        article_executor.submit(parse_xinhua_article, art_url, time_range_days): art_url
                        for art_url in articles
                    }

                    for article_future in as_completed(article_futures):
                        article = article_future.result()
                        if article:
                            logger.info(f"  ✔️ 文章:{article['title']}")
                            # Upsert by URL and mark crawled=True.
                            # BUG FIX: publish_time was parsed but never
                            # persisted — the Article model has a
                            # publish_time field that stayed NULL.
                            Article.objects.update_or_create(
                                url=article['url'],
                                defaults={
                                    'title': article['title'],
                                    'content': article['content'],
                                    'publish_time': article['publish_time'],
                                    'crawled': True
                                }
                            )
                            all_articles.append(article)
                        else:
                            logger.error(f"  ❌ 文章解析失败:{article_futures[article_future]}")
            except Exception as exc:
                logger.error(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")

    logger.info(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
    return all_articles
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual entry point: run the crawler directly from the command line.
    logger.info("开始爬取...")
    # Example invocation: only crawl articles from the last 7 days.
    crawl_xinhua_green(time_range_days=7)
    logger.info("爬取完成")
|
||||||
|
|
||||||
|
|
||||||
|
from django.http import JsonResponse
|
||||||
|
|
||||||
|
|
||||||
|
def export_articles(request):
    """
    Export all crawled articles as JSON.

    Triggered via /export/xinhua-article/. The optional GET parameter
    ``time_range_days`` (integer) limits the crawl to recent articles;
    a non-integer value yields a 400 JSON error.
    """
    raw_days = request.GET.get('time_range_days', None)
    time_range_days = None
    if raw_days is not None:
        try:
            time_range_days = int(raw_days)
        except ValueError:
            logger.error("❌ 无效的时间范围参数")
            return JsonResponse({"error": "无效的时间范围参数"}, status=400)

    logger.info(f"开始导出文章,时间范围:{time_range_days} 天")
    articles = crawl_xinhua_green(time_range_days=time_range_days)
    logger.info(f"成功导出 {len(articles)} 篇文章")

    # Keep CJK characters unescaped and the payload compact
    # (compact separators improve Safari compatibility).
    return JsonResponse(
        articles,
        safe=False,
        json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')}
    )
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command: crawl Xinhua articles and store them in the DB."""

    help = '爬取新华网文章并保存到数据库'

    def handle(self, *args, **kwargs):
        """Run the crawler and report how many articles were fetched."""
        self.stdout.write("开始爬取...")
        fetched = crawl_xinhua_green()
        self.stdout.write(f"爬取完成,共抓取 {len(fetched)} 篇文章")
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
# Generated by Django 5.1 on 2025-07-21 12:00
|
||||||
|
|
||||||
|
import django.db.models.deletion
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Initial schema for the collector app: SourceSite and Article tables."""

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='SourceSite',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100)),
                ('url', models.URLField()),
                ('is_active', models.BooleanField(default=True)),
                ('remarks', models.TextField(blank=True)),
            ],
        ),
        migrations.CreateModel(
            name='Article',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(max_length=300)),
                ('url', models.URLField(unique=True)),
                ('publish_date', models.DateField(blank=True, null=True)),
                ('content', models.TextField()),
                ('category', models.CharField(choices=[('政策', '政策'), ('案例', '案例'), ('新闻', '新闻'), ('科研', '科研')], max_length=100)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('is_verified', models.BooleanField(default=False)),
                # Nullable FK so deleting a SourceSite keeps its articles.
                ('source', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='collector.sourcesite')),
            ],
        ),
    ]
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
# Generated by Django 5.1 on 2025-07-21 12:52
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Simplify Article: rename is_verified -> crawled, drop the
    category/created_at/publish_date/source fields, shrink title to 255."""

    dependencies = [
        ('collector', '0001_initial'),
    ]

    operations = [
        migrations.RenameField(
            model_name='article',
            old_name='is_verified',
            new_name='crawled',
        ),
        migrations.RemoveField(
            model_name='article',
            name='category',
        ),
        migrations.RemoveField(
            model_name='article',
            name='created_at',
        ),
        migrations.RemoveField(
            model_name='article',
            name='publish_date',
        ),
        migrations.RemoveField(
            model_name='article',
            name='source',
        ),
        migrations.AlterField(
            model_name='article',
            name='title',
            field=models.CharField(max_length=255),
        ),
    ]
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# Generated by Django 5.1 on 2025-07-22 07:53
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add the nullable publish_time timestamp to Article."""

    dependencies = [
        ('collector', '0002_rename_is_verified_article_crawled_and_more'),
    ]

    operations = [
        migrations.AddField(
            model_name='article',
            name='publish_time',
            field=models.DateTimeField(blank=True, null=True),
        ),
    ]
|
||||||
20
src/green_classroom/apps/collector/models.py
Normal file
20
src/green_classroom/apps/collector/models.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from django.db import models
|
||||||
|
|
||||||
|
class SourceSite(models.Model):
    """A news source site that can be crawled."""

    # Human-readable site name.
    name = models.CharField(max_length=100)
    # Root URL of the site.
    url = models.URLField()
    # Whether this site should currently be crawled.
    is_active = models.BooleanField(default=True)
    # Free-form operator notes.
    remarks = models.TextField(blank=True)

    def __str__(self):
        """Show the site name in admin and shell output."""
        return self.name
|
||||||
|
|
||||||
|
class Article(models.Model):
    """A crawled news article, uniquely identified by its URL."""

    # URL is the natural key used by update_or_create in the crawler.
    url = models.URLField(unique=True)
    title = models.CharField(max_length=255)
    content = models.TextField()
    crawled = models.BooleanField(default=False)  # make sure this field exists
    publish_time = models.DateTimeField(null=True, blank=True)  # added publish_time field

    def __str__(self):
        """Show the article title in admin and shell output."""
        return self.title
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>{{ article.title }}</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>{{ article.title }}</h1>
|
||||||
|
<p><strong>来源:</strong>{{ article.source.name }}</p>
|
||||||
|
<p><strong>分类:</strong>{{ article.category }}</p>
|
||||||
|
<p><strong>发布时间:</strong>{{ article.publish_date }}</p>
|
||||||
|
<div>
|
||||||
|
{{ article.content|linebreaks }}
|
||||||
|
</div>
|
||||||
|
<p><a href="{% url 'collector:article_list' %}">返回列表</a></p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>绿色课堂资料库</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>绿色课堂资料库</h1>
|
||||||
|
<form method="get">
|
||||||
|
<input type="text" name="q" placeholder="关键词搜索..." value="{{ request.GET.q }}">
|
||||||
|
<button type="submit">搜索</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
{% for article in articles %}
|
||||||
|
<li>
|
||||||
|
<a href="{% url 'collector:article_detail' article.id %}">{{ article.title }}</a>
|
||||||
|
({{ article.category }} | {{ article.publish_date }})
|
||||||
|
</li>
|
||||||
|
{% empty %}
|
||||||
|
<li>暂无内容</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<input type="number" id="timeRange" placeholder="时间范围(天)" />
|
||||||
|
<button onclick="exportArticles()">导出文章</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
function exportArticles() {
|
||||||
|
const timeRange = document.getElementById('timeRange').value;
|
||||||
|
let exportUrl = '/collector/export/xinhua-article/';
|
||||||
|
|
||||||
|
if (timeRange) {
|
||||||
|
exportUrl += `?time_range_days=${timeRange}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 发起请求并触发下载
|
||||||
|
fetch(exportUrl)
|
||||||
|
.then(response => {
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error('导出失败');
|
||||||
|
}
|
||||||
|
return response.blob();
|
||||||
|
})
|
||||||
|
.then(blob => {
|
||||||
|
const downloadUrl = window.URL.createObjectURL(blob);
|
||||||
|
const a = document.createElement('a');
|
||||||
|
a.href = downloadUrl;
|
||||||
|
a.download = 'articles.json';
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
a.remove();
|
||||||
|
})
|
||||||
|
.catch(error => {
|
||||||
|
alert(error);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
3
src/green_classroom/apps/collector/tests.py
Normal file
3
src/green_classroom/apps/collector/tests.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from django.test import TestCase
|
||||||
|
|
||||||
|
# Create your tests here.
|
||||||
8
src/green_classroom/apps/collector/urls.py
Normal file
8
src/green_classroom/apps/collector/urls.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from django.urls import path

from . import views

# Namespace used by {% url 'collector:...' %} lookups.
app_name = 'collector'

urlpatterns = [
    path('article/<int:pk>/', views.article_detail, name='article_detail'),
    # BUG FIX: the templates reverse 'collector:article_list', but no such
    # name existed in this namespace, raising NoReverseMatch at render time.
    # Adding the route is backward compatible.
    path('articles/', views.list_articles, name='article_list'),
]
|
||||||
90
src/green_classroom/apps/collector/views.py
Normal file
90
src/green_classroom/apps/collector/views.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# 配置logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s [%(levelname)s] %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.StreamHandler(sys.stdout)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
from django.shortcuts import render
|
||||||
|
from django.http import HttpResponse
|
||||||
|
from django.http import JsonResponse
|
||||||
|
from .models import Article # 假设 Article 是你的模型
|
||||||
|
|
||||||
|
def index(request):
    """Home view: return a plain-text welcome message."""
    return HttpResponse("欢迎来到绿色课堂资料库!")
||||||
|
|
||||||
|
from django.shortcuts import render, get_object_or_404
|
||||||
|
from django.core.management import call_command
|
||||||
|
from green_classroom.apps.collector.models import Article
|
||||||
|
import os
|
||||||
|
from django.conf import settings
|
||||||
|
from django.template import TemplateDoesNotExist
|
||||||
|
|
||||||
|
def list_articles(request):
    """Render every Article in the list template."""
    return render(
        request,
        'collector/article_list.html',
        {'articles': Article.objects.all()},
    )
||||||
|
|
||||||
|
def article_detail(request, pk):
    """
    Return a single article as JSON by primary key, or a JSON 404 payload
    if no article with that pk exists.
    """
    # Guard clause: resolve the article first; everything else is the
    # success path.
    try:
        article = Article.objects.get(pk=pk)
    except Article.DoesNotExist:
        return JsonResponse({'error': '文章不存在'}, status=404)

    # Debug logs to confirm title/content came back intact.
    logger.info(f"返回文章标题: {article.title}")
    logger.info(f"返回文章内容: {article.content[:100]}...")

    payload = {
        'id': article.id,
        'url': article.url,
        # Force str() to sidestep any lazy/encoding surprises.
        'title': str(article.title),
        'content': str(article.content),
        # ISO-8601 timestamp, or null when the article has no publish time.
        'publish_time': article.publish_time.isoformat() if article.publish_time else None,
        'crawled': article.crawled,
    }
    # ensure_ascii=False keeps CJK characters readable in the response.
    return JsonResponse(payload, json_dumps_params={'ensure_ascii': False})
|
||||||
|
|
||||||
|
def run_crawler(request):
    """
    On POST, run the ``crawl_xinhua`` management command and show its
    output; on GET, just render the form.
    """
    result = []
    if request.method == 'POST':
        # Capture the management command's stdout so it can be displayed.
        from io import StringIO
        output = StringIO()
        call_command('crawl_xinhua', stdout=output)
        result.append(output.getvalue())

    # Debug aid: confirm the template is where we expect it on disk.
    template_path = os.path.join(settings.BASE_DIR, 'templates', 'collector', 'run_crawler.html')
    print(f"🔍 正在查找模板文件:{template_path}")
    # BUG FIX (idiom): the original opened the file just to test existence;
    # os.path.exists answers that without touching the file contents.
    if os.path.exists(template_path):
        print("✅ 模板文件存在")
    else:
        print("❌ 模板文件不存在,请检查路径")

    return render(request, 'collector/run_crawler.html', {'output': result})
|
||||||
|
|
||||||
|
def delete_all_articles(request):
    """
    Confirmation page (GET) and bulk deletion of every Article (POST).

    After deleting, redirects back to the article list.
    """
    if request.method == 'POST':
        # BUG FIX: `redirect` was never imported in this module, so the POST
        # branch raised NameError at runtime. Import it locally here to keep
        # this edit self-contained.
        from django.shortcuts import redirect
        Article.objects.all().delete()
        # NOTE(review): 'collector:article_list' must exist in the collector
        # URL namespace for this reverse to succeed — verify urls.py.
        return redirect('collector:article_list')
    return render(request, 'collector/delete_all_articles.html')
||||||
16
src/green_classroom/asgi.py
Normal file
16
src/green_classroom/asgi.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
"""
|
||||||
|
ASGI config for green_classroom project.
|
||||||
|
|
||||||
|
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||||
|
|
||||||
|
For more information on this file, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from django.core.asgi import get_asgi_application
|
||||||
|
|
||||||
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
|
||||||
|
|
||||||
|
application = get_asgi_application()
|
||||||
140
src/green_classroom/settings.py
Normal file
140
src/green_classroom/settings.py
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
"""
|
||||||
|
Django settings for green_classroom project.
|
||||||
|
|
||||||
|
Generated by 'django-admin startproject' using Django 5.1.
|
||||||
|
|
||||||
|
For more information on this file, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/topics/settings/
|
||||||
|
|
||||||
|
For the full list of settings and their values, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/ref/settings/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
|
||||||
|
|
||||||
|
# Quick-start development settings - unsuitable for production
|
||||||
|
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
||||||
|
|
||||||
|
# SECURITY WARNING: keep the secret key used in production secret!
|
||||||
|
SECRET_KEY = 'django-insecure-mi#9dyl0zwanl2=uziz3om_t**ovk08+pg127^+=5m=s6^+(@b'
|
||||||
|
|
||||||
|
# SECURITY WARNING: don't run with debug turned on in production!
|
||||||
|
DEBUG = True
|
||||||
|
#DEBUG = False
|
||||||
|
|
||||||
|
ALLOWED_HOSTS = ['*',]
|
||||||
|
|
||||||
|
|
||||||
|
# Application definition
|
||||||
|
|
||||||
|
INSTALLED_APPS = [
|
||||||
|
'django.contrib.admin',
|
||||||
|
'django.contrib.auth',
|
||||||
|
'django.contrib.contenttypes',
|
||||||
|
'django.contrib.sessions',
|
||||||
|
'django.contrib.messages',
|
||||||
|
'django.contrib.staticfiles',
|
||||||
|
'green_classroom.apps.collector.apps.CollectorConfig',
|
||||||
|
]
|
||||||
|
|
||||||
|
MIDDLEWARE = [
|
||||||
|
'django.middleware.security.SecurityMiddleware',
|
||||||
|
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||||
|
'django.middleware.common.CommonMiddleware',
|
||||||
|
'django.middleware.csrf.CsrfViewMiddleware',
|
||||||
|
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
||||||
|
'django.contrib.messages.middleware.MessageMiddleware',
|
||||||
|
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
||||||
|
]
|
||||||
|
|
||||||
|
ROOT_URLCONF = 'green_classroom.urls'
|
||||||
|
|
||||||
|
TEMPLATES = [
|
||||||
|
{
|
||||||
|
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||||
|
'DIRS': [os.path.join(BASE_DIR, 'templates')], # ✅ 确保此行存在
|
||||||
|
'APP_DIRS': True,
|
||||||
|
'OPTIONS': {
|
||||||
|
'context_processors': [
|
||||||
|
'django.template.context_processors.debug',
|
||||||
|
'django.template.context_processors.request',
|
||||||
|
'django.contrib.auth.context_processors.auth',
|
||||||
|
'django.contrib.messages.context_processors.messages',
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
WSGI_APPLICATION = 'green_classroom.wsgi.application'
|
||||||
|
|
||||||
|
|
||||||
|
# Database
|
||||||
|
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
||||||
|
|
||||||
|
DATABASES = {
|
||||||
|
'default': {
|
||||||
|
'ENGINE': 'django.db.backends.sqlite3',
|
||||||
|
'NAME': BASE_DIR / 'db.sqlite3',
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Password validation
|
||||||
|
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
||||||
|
|
||||||
|
AUTH_PASSWORD_VALIDATORS = [
|
||||||
|
{
|
||||||
|
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Internationalization
|
||||||
|
# https://docs.djangoproject.com/en/5.1/topics/i18n/
|
||||||
|
|
||||||
|
LANGUAGE_CODE = 'en-us'
|
||||||
|
|
||||||
|
TIME_ZONE = 'UTC'
|
||||||
|
|
||||||
|
USE_I18N = True
|
||||||
|
|
||||||
|
USE_TZ = True
|
||||||
|
|
||||||
|
|
||||||
|
# Static files (CSS, JavaScript, Images)
|
||||||
|
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
||||||
|
|
||||||
|
STATIC_URL = 'static/'
|
||||||
|
|
||||||
|
# Default primary key field type
|
||||||
|
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
||||||
|
|
||||||
|
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||||
|
|
||||||
|
# 增加最大字段数限制,适应 admin 页面大数据量展示
|
||||||
|
DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
|
||||||
|
|
||||||
|
|
||||||
|
import sentry_sdk
|
||||||
|
|
||||||
|
sentry_sdk.init(
|
||||||
|
dsn="https://a976759c113a1e07050c61fb4dfe16bc@sentry.yuangyaa.com/2",
|
||||||
|
# Add data like request headers and IP for users,
|
||||||
|
# see https://docs.sentry.io/platforms/python/data-management/data-collected/ for more info
|
||||||
|
send_default_pii=True,
|
||||||
|
environment="staging", # 或其他你设置的环境名
|
||||||
|
)
|
||||||
11
src/green_classroom/templates/collector/article_detail.html
Normal file
11
src/green_classroom/templates/collector/article_detail.html
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>{{ article.title }}</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>{{ article.title }}</h1>
|
||||||
|
<p>{{ article.content }}</p>
|
||||||
|
<a href="{% url 'collector:article_list' %}">返回文章列表</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
19
src/green_classroom/templates/collector/article_list.html
Normal file
19
src/green_classroom/templates/collector/article_list.html
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>文章列表</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>所有文章</h1>
|
||||||
|
<ul>
|
||||||
|
{% for article in articles %}
|
||||||
|
<li>
|
||||||
|
<a href="{% url 'collector:article_detail' article_id=article.id %}">
|
||||||
|
<strong>{{ article.title }}</strong>
|
||||||
|
</a><br>
|
||||||
|
{{ article.content|truncatewords:50 }}
|
||||||
|
</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>删除所有文章</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>确认删除所有文章</h1>
|
||||||
|
<p>您确定要删除所有文章吗?此操作不可撤销。</p>
|
||||||
|
<form method="post">
|
||||||
|
{% csrf_token %}
|
||||||
|
<button type="submit">删除所有文章</button>
|
||||||
|
</form>
|
||||||
|
<a href="{% url 'collector:article_list' %}">取消</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
17
src/green_classroom/templates/collector/run_crawler.html
Normal file
17
src/green_classroom/templates/collector/run_crawler.html
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>运行爬虫</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>运行爬虫</h1>
|
||||||
|
<form method="post">
|
||||||
|
{% csrf_token %}
|
||||||
|
<button type="submit">开始爬取</button>
|
||||||
|
</form>
|
||||||
|
{% if output %}
|
||||||
|
<h2>输出:</h2>
|
||||||
|
<pre>{{ output }}</pre>
|
||||||
|
{% endif %}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
10
src/green_classroom/urls.py
Normal file
10
src/green_classroom/urls.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from django.contrib import admin  # added import
from django.urls import include, path

from green_classroom.apps.collector import views

urlpatterns = [
    # Admin site (added back so /admin/ is reachable).
    path('admin/', admin.site.urls),
    # Collector app under its own namespace.
    path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
    # Flat article list at the project level.
    path('articles/', views.list_articles, name='article_list'),
]
||||||
|
|
||||||
16
src/green_classroom/wsgi.py
Normal file
16
src/green_classroom/wsgi.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
"""
|
||||||
|
WSGI config for green_classroom project.
|
||||||
|
|
||||||
|
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||||
|
|
||||||
|
For more information on this file, see
|
||||||
|
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from django.core.wsgi import get_wsgi_application
|
||||||
|
|
||||||
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
|
||||||
|
|
||||||
|
application = get_wsgi_application()
|
||||||
22
src/manage.py
Executable file
22
src/manage.py
Executable file
@@ -0,0 +1,22 @@
|
|||||||
|
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    # Point Django at this project's settings module before importing it.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
|
||||||
10
src/test_xinhua_article.py
Normal file
10
src/test_xinhua_article.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from green_classroom.apps.collector.crawler.xinhua import parse_xinhua_article

# Ad-hoc smoke test: fetch one known article and preview the result.
URL = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html"

article = parse_xinhua_article(URL)

if article is None:
    print("❌ 抓取失败")
else:
    print("✅ 成功抓取文章:", article["title"])
    print("📄 正文预览:\n", article["content"][:500])
||||||
Reference in New Issue
Block a user