Compare commits

8 Commits

Author SHA1 Message Date
61688f4bff add 多线程 && 新华网 2025-07-21 21:16:27 +08:00
c750d77eab Test success 2025-07-21 20:40:36 +08:00
babba5b923 create app colletcor 2025-07-18 04:22:33 +08:00
b273f22bba start django_project 2025-07-18 04:11:14 +08:00
95efe66ad4 Add 0.1 初始內容 2025-07-18 04:01:42 +08:00
5b31b3ef3e this is base env 2025-07-18 04:00:21 +08:00
3ecf847039 Init develop branch 2025-07-18 03:59:22 +08:00
e9ac04da60 Initial commit 2025-07-18 03:59:10 +08:00
39 changed files with 752 additions and 0 deletions

8
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

27
.idea/green_classroom.iml generated Normal file
View File

@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="FacetManager">
<facet type="django" name="Django">
<configuration>
<option name="rootFolder" value="$MODULE_DIR$/src" />
<option name="settingsModule" value="green_classroom/settings.py" />
<option name="manageScript" value="$MODULE_DIR$/src/manage.py" />
<option name="environment" value="&lt;map/&gt;" />
<option name="doNotUseTestRunner" value="false" />
<option name="trackFilePattern" value="migrations" />
</configuration>
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.12 (green_classroom)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml generated Normal file
View File

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.12 (green_classroom)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (green_classroom)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/green_classroom.iml" filepath="$PROJECT_DIR$/.idea/green_classroom.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

View File

@@ -1,3 +1,4 @@
# green_classroom
行政办 绿色课堂
# init projects

1
develop.txt Normal file
View File

@@ -0,0 +1 @@
# 開發分支

1
init.txt Normal file
View File

@@ -0,0 +1 @@
0.1 初始內容

6
requirements.in Normal file
View File

@@ -0,0 +1,6 @@
uv==0.7.22
django==5.1
psycopg2-binary
requests
beautifulsoup4
ipython

70
requirements.txt Normal file
View File

@@ -0,0 +1,70 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile requirements.in
#
--index-url https://pypi.nicestudio.com.tw/root/pypi
asgiref==3.9.1
# via django
asttokens==3.0.0
# via stack-data
beautifulsoup4==4.13.4
# via -r requirements.in
certifi==2025.7.14
# via requests
charset-normalizer==3.4.2
# via requests
decorator==5.2.1
# via ipython
django==5.1
# via -r requirements.in
executing==2.2.0
# via stack-data
idna==3.10
# via requests
ipython==9.4.0
# via -r requirements.in
ipython-pygments-lexers==1.1.1
# via ipython
jedi==0.19.2
# via ipython
matplotlib-inline==0.1.7
# via ipython
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
prompt-toolkit==3.0.51
# via ipython
psycopg2-binary==2.9.10
# via -r requirements.in
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pygments==2.19.2
# via
# ipython
# ipython-pygments-lexers
requests==2.32.4
# via -r requirements.in
soupsieve==2.7
# via beautifulsoup4
sqlparse==0.5.3
# via django
stack-data==0.6.3
# via ipython
traitlets==5.14.3
# via
# ipython
# matplotlib-inline
typing-extensions==4.14.1
# via beautifulsoup4
urllib3==2.5.0
# via requests
uv==0.7.22
# via -r requirements.in
wcwidth==0.2.13
# via prompt-toolkit

View File

View File

@@ -0,0 +1,7 @@
from django.contrib import admin
from .models import Article
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    """Admin configuration for the Article model."""

    # Fields matched by the admin search box.
    search_fields = ("title",)
    # Columns shown on the admin change-list page.
    list_display = ("title", "url", "crawled")

View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class CollectorConfig(AppConfig):
    """Application configuration for the collector app."""

    # Full dotted path of the app package.
    name = "green_classroom.apps.collector"
    default_auto_field = "django.db.models.BigAutoField"

View File

@@ -0,0 +1,124 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
import pytz # 确保已安装 pytz 库,用于处理时区
# Root URL of the news.cn site; not referenced by the functions visible in
# this part of the module.
BASE_URL = "https://www.news.cn/"
# HTTP headers sent with article requests (see parse_xinhua_article).
HEADERS = {
    "User-Agent": "Mozilla/5.0"
}
def parse_xinhua_article(url: str, time_range_days: int | None = None):
    """Fetch a Xinhua article page and extract its title, body and date.

    Args:
        url: Address of the article page to download.
        time_range_days: When not ``None``, articles whose publish time is
            more than this many days before the current UTC time are
            rejected.

    Returns:
        A dict with the keys ``url``, ``title``, ``content`` and
        ``publish_time`` (a timezone-aware UTC ``datetime``), or ``None``
        when the request fails or returns an HTTP error status, the
        ``detailContent`` element is missing, the body text is shorter than
        50 characters, the publish time is missing or unparseable, or the
        article is older than the requested time range.
    """
    # The standard library already provides a UTC tzinfo, so pytz is not
    # needed for this fixed offset.
    from datetime import timezone

    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        # HTTPError is a RequestException, so the handler below covers it.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 请求失败:{e} URL: {url}")
        return None
    resp.encoding = 'utf-8'

    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"❌ 没找到 detailContent: {url}")
        return None

    paragraphs = content_tag.find_all("p")
    content = "\n".join(p.get_text(strip=True) for p in paragraphs)
    if len(content.strip()) < 50:
        print(f"⚠️ 内容过短:{url}")
        return None

    # The publish time is assumed to be a bare date in YYYY-MM-DD form.
    publish_time_tag = soup.find("span", class_="publish-time")
    if not publish_time_tag:
        print(f"❌ 页面未找到发布时间:{url}")
        return None

    publish_time_str = publish_time_tag.get_text(strip=True)
    try:
        publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )
    except ValueError:
        print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
        return None

    # Drop articles that fall outside the requested time window.
    if time_range_days is not None:
        cutoff_time = datetime.now(timezone.utc) - timedelta(days=time_range_days)
        if publish_time < cutoff_time:
            print(f"⏰ 文章超出时间范围:{url}")
            return None

    return {
        "url": url,
        "title": title,
        "content": content,
        "publish_time": publish_time,
    }
def _crawl_channel_articles(article_urls, time_range_days):
    """Parse the given article URLs concurrently and store each parsed article.

    Returns the list of article dicts that were written to the database.
    A failure on one article is reported and skipped so that the remaining
    articles of the channel are still processed.
    """
    # Imported here because the model is not otherwise in scope in this
    # module, and importing it requires Django to be configured.
    from green_classroom.apps.collector.models import Article

    stored = []
    with ThreadPoolExecutor(max_workers=5) as article_executor:
        article_futures = {
            article_executor.submit(parse_xinhua_article, art_url, time_range_days): art_url
            for art_url in article_urls
        }
        for article_future in as_completed(article_futures):
            art_url = article_futures[article_future]
            try:
                article = article_future.result()
                if not article:
                    print(f" ❌ 文章解析失败:{art_url}")
                    continue
                print(f" ✔️ 文章:{article['title']}")
                # Create or update the row keyed on the URL and mark it crawled.
                Article.objects.update_or_create(
                    url=article['url'],
                    defaults={
                        'title': article['title'],
                        'content': article['content'],
                        'crawled': True,
                    },
                )
            except Exception as exc:
                # Best-effort crawl: report the failing article and carry on.
                print(f" ❌ 文章处理异常:{art_url} {exc}")
                continue
            stored.append(article)
    return stored


def crawl_xinhua_green(time_range_days: int | None = None):
    """Crawl the Xinhua channels and save their articles to the database.

    Channel pages are fetched concurrently; the articles of each channel are
    then parsed concurrently and stored via ``Article.objects.update_or_create``.

    Args:
        time_range_days: Passed through to ``parse_xinhua_article``; when not
            ``None`` only articles from the last that many days are kept.

    Returns:
        The list of article dicts that were parsed and stored.
    """
    print("✅ 开始爬取新华网栏目列表...")
    # NOTE(review): get_channel_urls and get_article_urls_from_channel are not
    # defined in the part of this module that is visible here — confirm that
    # they exist before relying on this function.
    channels = get_channel_urls()
    print(f"共找到 {len(channels)} 个频道")

    all_articles = []
    # Fetch the article links of every channel concurrently.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_channel = {
            executor.submit(get_article_urls_from_channel, ch_url): ch_url
            for ch_url in channels
        }
        for future in as_completed(future_to_channel):
            ch_url = future_to_channel[future]
            try:
                articles = future.result()
                print(f"\n➡️ 抓取频道:{ch_url}")
                print(f" 该频道找到 {len(articles)} 篇文章")
                all_articles.extend(_crawl_channel_articles(articles, time_range_days))
            except Exception as exc:
                # A failing channel must not stop the remaining channels.
                print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")

    print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)}")
    return all_articles


if __name__ == "__main__":
    # Example: crawl only the articles of the last 7 days.
    # NOTE(review): storing articles needs a configured Django environment;
    # the management command is presumably the intended entry point — confirm.
    crawl_xinhua_green(time_range_days=7)

View File

@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
class Command(BaseCommand):
    """Management command that runs the Xinhua crawler."""

    help = '爬取新华网文章并保存到数据库'

    def handle(self, *args, **kwargs):
        """Run the crawl and write the number of returned articles to stdout."""
        out = self.stdout
        out.write("开始爬取...")
        crawled = crawl_xinhua_green()
        total = len(crawled)
        out.write(f"爬取完成,共抓取 {total} 篇文章")

View File

@@ -0,0 +1,39 @@
# Generated by Django 5.1 on 2025-07-21 12:00
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    # Initial migration of the collector app: it has no dependencies and
    # creates the SourceSite and Article models.
    initial = True
    dependencies = [
    ]
    operations = [
        # SourceSite: name, URL, active flag and free-text remarks.
        migrations.CreateModel(
            name='SourceSite',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100)),
                ('url', models.URLField()),
                ('is_active', models.BooleanField(default=True)),
                ('remarks', models.TextField(blank=True)),
            ],
        ),
        # Article: unique URL, title, content and metadata, with an optional
        # link to its SourceSite that is set to NULL when the site is deleted.
        migrations.CreateModel(
            name='Article',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(max_length=300)),
                ('url', models.URLField(unique=True)),
                ('publish_date', models.DateField(blank=True, null=True)),
                ('content', models.TextField()),
                ('category', models.CharField(choices=[('政策', '政策'), ('案例', '案例'), ('新闻', '新闻'), ('科研', '科研')], max_length=100)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('is_verified', models.BooleanField(default=False)),
                ('source', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='collector.sourcesite')),
            ],
        ),
    ]

View File

@@ -0,0 +1,39 @@
# Generated by Django 5.1 on 2025-07-21 12:52
from django.db import migrations, models
class Migration(migrations.Migration):
    # Second migration: reduces Article to url, title, content and a
    # "crawled" flag.
    dependencies = [
        ('collector', '0001_initial'),
    ]
    operations = [
        # The existing boolean column is kept and renamed to "crawled".
        migrations.RenameField(
            model_name='article',
            old_name='is_verified',
            new_name='crawled',
        ),
        # Drop the fields the simplified model no longer has.
        migrations.RemoveField(
            model_name='article',
            name='category',
        ),
        migrations.RemoveField(
            model_name='article',
            name='created_at',
        ),
        migrations.RemoveField(
            model_name='article',
            name='publish_date',
        ),
        migrations.RemoveField(
            model_name='article',
            name='source',
        ),
        # Title length shrinks from 300 to 255 characters.
        migrations.AlterField(
            model_name='article',
            name='title',
            field=models.CharField(max_length=255),
        ),
    ]

View File

@@ -0,0 +1,19 @@
from django.db import models
class SourceSite(models.Model):
    """A source website record: name, URL, active flag and remarks."""

    name = models.CharField(max_length=100)
    url = models.URLField()
    # Defaults to active.
    is_active = models.BooleanField(default=True)
    # Optional free-text notes.
    remarks = models.TextField(blank=True)

    def __str__(self) -> str:
        """Return the site name as the display label."""
        return self.name
class Article(models.Model):
    """A stored article, identified by its unique URL."""

    # Unique, so an article is stored at most once per URL.
    url = models.URLField(unique=True)
    title = models.CharField(max_length=255)
    content = models.TextField()
    # False by default; the crawler sets it to True when it stores a row.
    crawled = models.BooleanField(default=False)

    def __str__(self) -> str:
        """Return the article title as the display label."""
        return self.title

View File

@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ article.title }}</title>
</head>
<body>
<h1>{{ article.title }}</h1>
<p><strong>来源:</strong>{{ article.source.name }}</p>
<p><strong>分类:</strong>{{ article.category }}</p>
<p><strong>发布时间:</strong>{{ article.publish_date }}</p>
<div>
{{ article.content|linebreaks }}
</div>
<p><a href="{% url 'collector:article_list' %}">返回列表</a></p>
</body>
</html>

View File

@@ -0,0 +1,24 @@
<!DOCTYPE html>
<html>
<head>
<title>绿色课堂资料库</title>
</head>
<body>
<h1>绿色课堂资料库</h1>
<form method="get">
<input type="text" name="q" placeholder="关键词搜索..." value="{{ request.GET.q }}">
<button type="submit">搜索</button>
</form>
<ul>
{% for article in articles %}
<li>
<a href="{% url 'collector:article_detail' article.id %}">{{ article.title }}</a>
{{ article.category }} | {{ article.publish_date }}
</li>
{% empty %}
<li>暂无内容</li>
{% endfor %}
</ul>
</body>
</html>

View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View File

@@ -0,0 +1,10 @@
from django.urls import path
from green_classroom.apps.collector import views
# Application namespace used in reversed names such as 'collector:article_list'.
app_name = 'collector'
# Routes of the collector app: bulk delete, article detail and article list.
urlpatterns = [
    path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
    path('articles/', views.list_articles, name='article_list'),  # article list page
]

View File

@@ -0,0 +1,51 @@
from django.shortcuts import render, get_object_or_404
from django.core.management import call_command
from green_classroom.apps.collector.models import Article
import os
from django.conf import settings
from django.template import TemplateDoesNotExist
def list_articles(request):
    """Render the article list page with every stored article."""
    context = {'articles': Article.objects.all()}
    return render(request, 'collector/article_list.html', context)
def article_detail(request, article_id):
    """Render the detail page of one article, or answer 404 if it does not exist."""
    found = get_object_or_404(Article, id=article_id)
    context = {'article': found}
    return render(request, 'collector/article_detail.html', context)
def run_crawler(request):
    """Render the crawler page; on POST, run the ``crawl_xinhua`` command first.

    The text the command writes to its stdout is captured and passed to the
    template as the string ``output`` (empty when nothing was run), so the
    template can print it directly instead of showing a list repr.
    """
    from io import StringIO

    output = ""
    if request.method == 'POST':
        # NOTE(review): the command runs synchronously inside the request, so
        # the response is only sent once the whole crawl has finished.
        buffer = StringIO()
        call_command('crawl_xinhua', stdout=buffer)
        output = buffer.getvalue()
    # Template lookup is left to Django's loaders; a missing template is
    # reported by render() itself.
    return render(request, 'collector/run_crawler.html', {'output': output})
def delete_all_articles(request):
    """Delete every article on POST; otherwise show the confirmation page.

    After a successful delete the client is redirected to the article list.
    """
    # ``redirect`` is not brought into scope by this module's import block,
    # so it is imported here; without it every POST raised NameError after
    # the rows had already been deleted.
    from django.shortcuts import redirect

    # NOTE(review): this view performs no authentication or permission check
    # before deleting all rows — confirm whether access should be restricted.
    if request.method == 'POST':
        Article.objects.all().delete()
        return redirect('collector:article_list')
    return render(request, 'collector/delete_all_articles.html')

View File

@@ -0,0 +1,16 @@
"""
ASGI config for green_classroom project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Use the project settings unless DJANGO_SETTINGS_MODULE is already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
# Module-level ASGI callable.
application = get_asgi_application()

View File

@@ -0,0 +1,128 @@
"""
Django settings for green_classroom project.
Generated by 'django-admin startproject' using Django 5.1.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
from pathlib import Path
import os
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): the key is hard-coded in the repository; load it from the
# environment before any deployment.
SECRET_KEY = 'django-insecure-mi#9dyl0zwanl2=uziz3om_t**ovk08+pg127^+=5m=s6^+(@b'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = []

# Application definition
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    # Project app, registered through its AppConfig class.
    'green_classroom.apps.collector.apps.CollectorConfig',
]
MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'green_classroom.urls'
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [os.path.join(BASE_DIR, 'templates')],  # project-level templates directory; keep this entry
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]
WSGI_APPLICATION = 'green_classroom.wsgi.application'

# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}

# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

# Raise the maximum number of form fields so admin pages that display a large
# amount of data can still be submitted.
DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240

View File

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ article.title }}</title>
</head>
<body>
<h1>{{ article.title }}</h1>
<p>{{ article.content }}</p>
<a href="{% url 'collector:article_list' %}">返回文章列表</a>
</body>
</html>

View File

@@ -0,0 +1,19 @@
<!DOCTYPE html>
<html>
<head>
<title>文章列表</title>
</head>
<body>
<h1>所有文章</h1>
<ul>
{% for article in articles %}
<li>
<a href="{% url 'collector:article_detail' article_id=article.id %}">
<strong>{{ article.title }}</strong>
</a><br>
{{ article.content|truncatewords:50 }}
</li>
{% endfor %}
</ul>
</body>
</html>

View File

@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html>
<head>
<title>删除所有文章</title>
</head>
<body>
<h1>确认删除所有文章</h1>
<p>您确定要删除所有文章吗?此操作不可撤销。</p>
<form method="post">
{% csrf_token %}
<button type="submit">删除所有文章</button>
</form>
<a href="{% url 'collector:article_list' %}">取消</a>
</body>
</html>

View File

@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html>
<head>
<title>运行爬虫</title>
</head>
<body>
<h1>运行爬虫</h1>
<form method="post">
{% csrf_token %}
<button type="submit">开始爬取</button>
</form>
{% if output %}
<h2>输出:</h2>
<pre>{{ output }}</pre>
{% endif %}
</body>
</html>

View File

@@ -0,0 +1,9 @@
from django.urls import path, include
from django.contrib import admin # 新增导入
from green_classroom.apps.collector import views
# Project routes: the collector app under /collector/, a top-level article
# list, and the Django admin.
urlpatterns = [
    path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
    path('articles/', views.list_articles, name='article_list'),
    path('admin/', admin.site.urls),  # exposes the Django admin pages
]

View File

@@ -0,0 +1,16 @@
"""
WSGI config for green_classroom project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Use the project settings unless DJANGO_SETTINGS_MODULE is already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
# Module-level WSGI callable.
application = get_wsgi_application()

22
src/manage.py Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Run administrative tasks."""
    # The settings module must be named before Django's management code runs.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
    try:
        from django.core.management import execute_from_command_line as run_cli
    except ImportError as exc:
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    run_cli(sys.argv)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,10 @@
from green_classroom.apps.collector.crawler.xinhua import parse_xinhua_article
# Manual smoke test: fetch one article and print its title and a preview.
url = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html"
article = parse_xinhua_article(url)
if not article:
    print("❌ 抓取失败")
else:
    print("✅ 成功抓取文章:", article["title"])
    # Only the first 500 characters of the body are shown.
    print("📄 正文预览:\n", article["content"][:500])