Compare commits
8 Commits
149c051dab
...
develop
| Author | SHA1 | Date | |
|---|---|---|---|
| 61688f4bff | |||
| c750d77eab | |||
| babba5b923 | |||
| b273f22bba | |||
| 95efe66ad4 | |||
| 5b31b3ef3e | |||
| 3ecf847039 | |||
| e9ac04da60 |
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
# 默认忽略的文件
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# 基于编辑器的 HTTP 客户端请求
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
27
.idea/green_classroom.iml
generated
Normal file
27
.idea/green_classroom.iml
generated
Normal file
@@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="FacetManager">
|
||||
<facet type="django" name="Django">
|
||||
<configuration>
|
||||
<option name="rootFolder" value="$MODULE_DIR$/src" />
|
||||
<option name="settingsModule" value="green_classroom/settings.py" />
|
||||
<option name="manageScript" value="$MODULE_DIR$/src/manage.py" />
|
||||
<option name="environment" value="<map/>" />
|
||||
<option name="doNotUseTestRunner" value="false" />
|
||||
<option name="trackFilePattern" value="migrations" />
|
||||
</configuration>
|
||||
</facet>
|
||||
</component>
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.12 (green_classroom)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
7
.idea/misc.xml
generated
Normal file
7
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.12 (green_classroom)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (green_classroom)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/green_classroom.iml" filepath="$PROJECT_DIR$/.idea/green_classroom.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
1
develop.txt
Normal file
1
develop.txt
Normal file
@@ -0,0 +1 @@
|
||||
# 開發分支
|
||||
6
requirements.in
Normal file
6
requirements.in
Normal file
@@ -0,0 +1,6 @@
|
||||
uv==0.7.22
|
||||
django==5.1
|
||||
psycopg2-binary
|
||||
requests
|
||||
beautifulsoup4
|
||||
ipython
|
||||
70
requirements.txt
Normal file
70
requirements.txt
Normal file
@@ -0,0 +1,70 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.12
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
#
|
||||
--index-url https://pypi.nicestudio.com.tw/root/pypi
|
||||
|
||||
asgiref==3.9.1
|
||||
# via django
|
||||
asttokens==3.0.0
|
||||
# via stack-data
|
||||
beautifulsoup4==4.13.4
|
||||
# via -r requirements.in
|
||||
certifi==2025.7.14
|
||||
# via requests
|
||||
charset-normalizer==3.4.2
|
||||
# via requests
|
||||
decorator==5.2.1
|
||||
# via ipython
|
||||
django==5.1
|
||||
# via -r requirements.in
|
||||
executing==2.2.0
|
||||
# via stack-data
|
||||
idna==3.10
|
||||
# via requests
|
||||
ipython==9.4.0
|
||||
# via -r requirements.in
|
||||
ipython-pygments-lexers==1.1.1
|
||||
# via ipython
|
||||
jedi==0.19.2
|
||||
# via ipython
|
||||
matplotlib-inline==0.1.7
|
||||
# via ipython
|
||||
parso==0.8.4
|
||||
# via jedi
|
||||
pexpect==4.9.0
|
||||
# via ipython
|
||||
prompt-toolkit==3.0.51
|
||||
# via ipython
|
||||
psycopg2-binary==2.9.10
|
||||
# via -r requirements.in
|
||||
ptyprocess==0.7.0
|
||||
# via pexpect
|
||||
pure-eval==0.2.3
|
||||
# via stack-data
|
||||
pygments==2.19.2
|
||||
# via
|
||||
# ipython
|
||||
# ipython-pygments-lexers
|
||||
requests==2.32.4
|
||||
# via -r requirements.in
|
||||
soupsieve==2.7
|
||||
# via beautifulsoup4
|
||||
sqlparse==0.5.3
|
||||
# via django
|
||||
stack-data==0.6.3
|
||||
# via ipython
|
||||
traitlets==5.14.3
|
||||
# via
|
||||
# ipython
|
||||
# matplotlib-inline
|
||||
typing-extensions==4.14.1
|
||||
# via beautifulsoup4
|
||||
urllib3==2.5.0
|
||||
# via requests
|
||||
uv==0.7.22
|
||||
# via -r requirements.in
|
||||
wcwidth==0.2.13
|
||||
# via prompt-toolkit
|
||||
0
src/green_classroom/__init__.py
Normal file
0
src/green_classroom/__init__.py
Normal file
0
src/green_classroom/apps/collector/__init__.py
Normal file
0
src/green_classroom/apps/collector/__init__.py
Normal file
7
src/green_classroom/apps/collector/admin.py
Normal file
7
src/green_classroom/apps/collector/admin.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from django.contrib import admin
|
||||
from .models import Article
|
||||
|
||||
class ArticleAdmin(admin.ModelAdmin):
    """Admin options for ``Article`` rows."""

    # Columns shown on the admin change-list page.
    list_display = ('title', 'url', 'crawled')
    # Fields searched by the change-list search box.
    search_fields = ('title',)


# Explicit registration; equivalent to decorating the class with @admin.register(Article).
admin.site.register(Article, ArticleAdmin)
|
||||
6
src/green_classroom/apps/collector/apps.py
Normal file
6
src/green_classroom/apps/collector/apps.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class CollectorConfig(AppConfig):
    """Django app configuration for the collector app."""

    # Full dotted path of the app package (it is nested under green_classroom/apps/).
    name = 'green_classroom.apps.collector'
    # Primary-key type used for models that do not declare one explicitly.
    default_auto_field = 'django.db.models.BigAutoField'
|
||||
124
src/green_classroom/apps/collector/crawler/xinhua.py
Normal file
124
src/green_classroom/apps/collector/crawler/xinhua.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timedelta
|
||||
import pytz # 确保已安装 pytz 库,用于处理时区
|
||||
|
||||
BASE_URL = "https://www.news.cn/"
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0"
|
||||
}
|
||||
|
||||
def parse_xinhua_article(url: str, time_range_days: int | None = None):
    """Fetch a Xinhua (news.cn) article page and extract its title and body.

    Args:
        url: URL of the article page to download.
        time_range_days: When not ``None``, articles whose publish date is
            more than this many days in the past are rejected.

    Returns:
        A dict with the keys ``url``, ``title``, ``content`` and
        ``publish_time`` (a UTC-aware ``datetime``), or ``None`` when the
        request fails or returns an HTTP error status, the ``detailContent``
        element is missing, the extracted text is shorter than 50 characters,
        the publish time is missing or unparseable, or the article is older
        than the requested range.
    """
    # The standard library provides UTC; pytz was used only for this constant
    # and is not listed in the project's requirements files.
    from datetime import timezone

    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        # HTTP error statuses (404, 500, ...) raise HTTPError, a RequestException
        # subclass, so error pages are no longer parsed as if they were articles.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 请求失败:{e} URL: {url}")
        return None

    # Force UTF-8 decoding of the body before reading resp.text.
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # The title is optional: fall back to a placeholder when the tag is absent.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"❌ 没找到 detailContent: {url}")
        return None

    # One output line per <p> element inside the content container.
    content = "\n".join(
        p.get_text(strip=True) for p in content_tag.find_all("p")
    )
    if len(content.strip()) < 50:
        print(f"⚠️ 内容过短:{url}")
        return None

    # Publish time is required; the page is assumed to expose it as YYYY-MM-DD.
    publish_time_tag = soup.find("span", class_="publish-time")
    if not publish_time_tag:
        print(f"❌ 页面未找到发布时间:{url}")
        return None

    publish_time_str = publish_time_tag.get_text(strip=True)
    try:
        publish_time = datetime.strptime(
            publish_time_str, "%Y-%m-%d"
        ).replace(tzinfo=timezone.utc)
    except ValueError:
        print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
        return None

    # Optional recency filter.
    if time_range_days is not None:
        cutoff_time = datetime.now(timezone.utc) - timedelta(days=time_range_days)
        if publish_time < cutoff_time:
            print(f"⏰ 文章超出时间范围:{url}")
            return None

    return {
        "url": url,
        "title": title,
        "content": content,
        "publish_time": publish_time,
    }
|
||||
|
||||
def crawl_xinhua_green(time_range_days: int | None = None):
    """Crawl Xinhua channels and their articles and store the parsed articles.

    Channel pages are fetched concurrently in a thread pool. As each channel
    completes, its article URLs are parsed concurrently in a second pool, and
    every successfully parsed article is upserted by URL with ``crawled=True``.
    The database writes happen in the calling thread, not in the workers.

    Args:
        time_range_days: Passed through to ``parse_xinhua_article``; when not
            ``None``, only articles from the last N days are kept.

    Returns:
        A list of the article dicts that were parsed and saved.
    """
    # The original referenced ``Article`` without importing it, so every save
    # raised NameError (then swallowed by the broad ``except`` below).
    from green_classroom.apps.collector.models import Article

    print("✅ 开始爬取新华网栏目列表...")
    # NOTE(review): get_channel_urls and get_article_urls_from_channel are not
    # defined in the visible part of this module — confirm they exist.
    channels = get_channel_urls()
    print(f"共找到 {len(channels)} 个频道")

    all_articles = []

    # Fetch the article links of every channel concurrently.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_channel = {
            executor.submit(get_article_urls_from_channel, ch_url): ch_url
            for ch_url in channels
        }

        for future in as_completed(future_to_channel):
            ch_url = future_to_channel[future]
            try:
                articles = future.result()
                print(f"\n➡️ 抓取频道:{ch_url}")
                print(f" 该频道找到 {len(articles)} 篇文章")

                # Parse the articles of this channel concurrently.
                with ThreadPoolExecutor(max_workers=5) as article_executor:
                    article_futures = {
                        article_executor.submit(
                            parse_xinhua_article, art_url, time_range_days
                        ): art_url
                        for art_url in articles
                    }

                    for article_future in as_completed(article_futures):
                        art_url = article_futures[article_future]
                        try:
                            article = article_future.result()
                        except Exception as exc:
                            # One broken article must not abort the remaining
                            # articles of the same channel.
                            print(f" ❌ 文章处理异常:{art_url} {exc}")
                            continue

                        if not article:
                            print(f" ❌ 文章解析失败:{art_url}")
                            continue

                        print(f" ✔️ 文章:{article['title']}")
                        # Create or update the row for this URL and mark it crawled.
                        Article.objects.update_or_create(
                            url=article['url'],
                            defaults={
                                'title': article['title'],
                                'content': article['content'],
                                'crawled': True,
                            },
                        )
                        all_articles.append(article)
            except Exception as exc:
                # Top-level boundary per channel: report and continue with the next one.
                print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")

    print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
    return all_articles


if __name__ == "__main__":
    crawl_xinhua_green(time_range_days=7)  # Example: crawl only the last 7 days
|
||||
@@ -0,0 +1,10 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
|
||||
|
||||
class Command(BaseCommand):
    """Management command that runs the Xinhua crawl and reports the article count."""

    help = '爬取新华网文章并保存到数据库'

    def handle(self, *args, **kwargs):
        """Run ``crawl_xinhua_green`` and write start/finish messages to stdout."""
        emit = self.stdout.write
        emit("开始爬取...")
        crawled = crawl_xinhua_green()
        emit(f"爬取完成,共抓取 {len(crawled)} 篇文章")
|
||||
@@ -0,0 +1,39 @@
|
||||
# Generated by Django 5.1 on 2025-07-21 12:00
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='SourceSite',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=100)),
|
||||
('url', models.URLField()),
|
||||
('is_active', models.BooleanField(default=True)),
|
||||
('remarks', models.TextField(blank=True)),
|
||||
],
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Article',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('title', models.CharField(max_length=300)),
|
||||
('url', models.URLField(unique=True)),
|
||||
('publish_date', models.DateField(blank=True, null=True)),
|
||||
('content', models.TextField()),
|
||||
('category', models.CharField(choices=[('政策', '政策'), ('案例', '案例'), ('新闻', '新闻'), ('科研', '科研')], max_length=100)),
|
||||
('created_at', models.DateTimeField(auto_now_add=True)),
|
||||
('is_verified', models.BooleanField(default=False)),
|
||||
('source', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='collector.sourcesite')),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,39 @@
|
||||
# Generated by Django 5.1 on 2025-07-21 12:52
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('collector', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='article',
|
||||
old_name='is_verified',
|
||||
new_name='crawled',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='article',
|
||||
name='category',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='article',
|
||||
name='created_at',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='article',
|
||||
name='publish_date',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='article',
|
||||
name='source',
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='article',
|
||||
name='title',
|
||||
field=models.CharField(max_length=255),
|
||||
),
|
||||
]
|
||||
19
src/green_classroom/apps/collector/models.py
Normal file
19
src/green_classroom/apps/collector/models.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from django.db import models
|
||||
|
||||
class SourceSite(models.Model):
    """Source-site record: a name, a URL, an active flag and free-form remarks."""

    name = models.CharField(max_length=100)
    url = models.URLField()
    # Defaults to True for newly created rows.
    is_active = models.BooleanField(default=True)
    # Optional in forms (blank allowed).
    remarks = models.TextField(blank=True)

    def __str__(self) -> str:
        """Use the site name as the display label."""
        label = self.name
        return label
|
||||
|
||||
class Article(models.Model):
    """Stored article: unique URL, title, body text and a crawled flag."""

    # Unique, so a URL identifies at most one row.
    url = models.URLField(unique=True)
    title = models.CharField(max_length=255)
    content = models.TextField()
    # False by default; make sure this field exists.
    crawled = models.BooleanField(default=False)

    def __str__(self) -> str:
        """Use the article title as the display label."""
        label = self.title
        return label
|
||||
@@ -0,0 +1,16 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>{{ article.title }}</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>{{ article.title }}</h1>
|
||||
<p><strong>来源:</strong>{{ article.source.name }}</p>
|
||||
<p><strong>分类:</strong>{{ article.category }}</p>
|
||||
<p><strong>发布时间:</strong>{{ article.publish_date }}</p>
|
||||
<div>
|
||||
{{ article.content|linebreaks }}
|
||||
</div>
|
||||
<p><a href="{% url 'collector:article_list' %}">返回列表</a></p>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,24 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>绿色课堂资料库</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>绿色课堂资料库</h1>
|
||||
<form method="get">
|
||||
<input type="text" name="q" placeholder="关键词搜索..." value="{{ request.GET.q }}">
|
||||
<button type="submit">搜索</button>
|
||||
</form>
|
||||
|
||||
<ul>
|
||||
{% for article in articles %}
|
||||
<li>
|
||||
<a href="{% url 'collector:article_detail' article.id %}">{{ article.title }}</a>
|
||||
({{ article.category }} | {{ article.publish_date }})
|
||||
</li>
|
||||
{% empty %}
|
||||
<li>暂无内容</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
3
src/green_classroom/apps/collector/tests.py
Normal file
3
src/green_classroom/apps/collector/tests.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
||||
10
src/green_classroom/apps/collector/urls.py
Normal file
10
src/green_classroom/apps/collector/urls.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from django.urls import path
|
||||
from green_classroom.apps.collector import views
|
||||
|
||||
app_name = 'collector' # 确保命名空间正确
|
||||
|
||||
urlpatterns = [
|
||||
path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
|
||||
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
|
||||
path('articles/', views.list_articles, name='article_list'), # 添加这一行
|
||||
]
|
||||
51
src/green_classroom/apps/collector/views.py
Normal file
51
src/green_classroom/apps/collector/views.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from django.core.management import call_command
|
||||
from green_classroom.apps.collector.models import Article
|
||||
import os
|
||||
from django.conf import settings
|
||||
from django.template import TemplateDoesNotExist
|
||||
|
||||
def list_articles(request):
    """Render the list page with every stored article under ``articles``."""
    context = {'articles': Article.objects.all()}
    return render(request, 'collector/article_list.html', context)
|
||||
|
||||
def article_detail(request, article_id):
    """Render the detail page for one article; respond 404 if the id is unknown."""
    found = get_object_or_404(Article, id=article_id)
    context = {'article': found}
    return render(request, 'collector/article_detail.html', context)
|
||||
|
||||
def run_crawler(request):
    """Run the ``crawl_xinhua`` management command on POST and show its output.

    On POST the command runs synchronously with its stdout captured, and the
    captured text is handed to the template as a one-element list under
    ``output``. For any other method the list is empty.
    """
    result = []
    if request.method == 'POST':
        # Capture what the command writes to its stdout.
        from io import StringIO

        output = StringIO()
        call_command('crawl_xinhua', stdout=output)
        result.append(output.getvalue())

    # Debug aid: print the on-disk path where the template is expected.
    template_path = os.path.join(
        settings.BASE_DIR, 'templates', 'collector', 'run_crawler.html'
    )
    print(f"🔍 正在查找模板文件:{template_path}")

    # Debug aid: test for existence without opening the file, so an unreadable
    # path cannot raise from this diagnostic code and no file handle is created.
    if os.path.isfile(template_path):
        print("✅ 模板文件存在")
    else:
        print("❌ 模板文件不存在,请检查路径")

    return render(request, 'collector/run_crawler.html', {'output': result})
|
||||
|
||||
def delete_all_articles(request):
    """Delete every article on POST; otherwise render the confirmation page.

    After a successful POST the client is redirected to the article list
    (``collector:article_list``).
    """
    # ``redirect`` is not part of the module's visible django.shortcuts import,
    # so the original raised NameError on every POST; import it locally.
    from django.shortcuts import redirect

    # NOTE(review): no authentication or permission check is visible in this
    # view — confirm that access is restricted elsewhere.
    if request.method != 'POST':
        return render(request, 'collector/delete_all_articles.html')

    Article.objects.all().delete()
    return redirect('collector:article_list')
|
||||
16
src/green_classroom/asgi.py
Normal file
16
src/green_classroom/asgi.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
ASGI config for green_classroom project.
|
||||
|
||||
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
|
||||
|
||||
application = get_asgi_application()
|
||||
128
src/green_classroom/settings.py
Normal file
128
src/green_classroom/settings.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
Django settings for green_classroom project.
|
||||
|
||||
Generated by 'django-admin startproject' using Django 5.1.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/topics/settings/
|
||||
|
||||
For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/5.1/ref/settings/
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
# Quick-start development settings - unsuitable for production
|
||||
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
# Read from DJANGO_SECRET_KEY when set; the development key below is only a
# fallback so local runs keep working without any environment configuration.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-mi#9dyl0zwanl2=uziz3om_t**ovk08+pg127^+=5m=s6^+(@b',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Defaults to True (the previous hard-coded value); set DJANGO_DEBUG to
# 0/false/no/off to disable it.
DEBUG = os.environ.get('DJANGO_DEBUG', 'true').strip().lower() in (
    '1', 'true', 'yes', 'on',
)

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; empty list by default.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]
|
||||
|
||||
|
||||
# Application definition
|
||||
|
||||
INSTALLED_APPS = [
|
||||
'django.contrib.admin',
|
||||
'django.contrib.auth',
|
||||
'django.contrib.contenttypes',
|
||||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'green_classroom.apps.collector.apps.CollectorConfig',
|
||||
]
|
||||
|
||||
MIDDLEWARE = [
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||
'django.middleware.common.CommonMiddleware',
|
||||
'django.middleware.csrf.CsrfViewMiddleware',
|
||||
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
||||
'django.contrib.messages.middleware.MessageMiddleware',
|
||||
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
||||
]
|
||||
|
||||
ROOT_URLCONF = 'green_classroom.urls'
|
||||
|
||||
TEMPLATES = [
|
||||
{
|
||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||
'DIRS': [os.path.join(BASE_DIR, 'templates')], # ✅ 确保此行存在
|
||||
'APP_DIRS': True,
|
||||
'OPTIONS': {
|
||||
'context_processors': [
|
||||
'django.template.context_processors.debug',
|
||||
'django.template.context_processors.request',
|
||||
'django.contrib.auth.context_processors.auth',
|
||||
'django.contrib.messages.context_processors.messages',
|
||||
],
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
WSGI_APPLICATION = 'green_classroom.wsgi.application'
|
||||
|
||||
|
||||
# Database
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
||||
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.sqlite3',
|
||||
'NAME': BASE_DIR / 'db.sqlite3',
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
# Password validation
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
||||
|
||||
AUTH_PASSWORD_VALIDATORS = [
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
||||
},
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
||||
},
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
||||
},
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# Internationalization
|
||||
# https://docs.djangoproject.com/en/5.1/topics/i18n/
|
||||
|
||||
LANGUAGE_CODE = 'en-us'
|
||||
|
||||
TIME_ZONE = 'UTC'
|
||||
|
||||
USE_I18N = True
|
||||
|
||||
USE_TZ = True
|
||||
|
||||
|
||||
# Static files (CSS, JavaScript, Images)
|
||||
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
||||
|
||||
STATIC_URL = 'static/'
|
||||
|
||||
# Default primary key field type
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
||||
|
||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
|
||||
# 增加最大字段数限制,适应 admin 页面大数据量展示
|
||||
DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
|
||||
11
src/green_classroom/templates/collector/article_detail.html
Normal file
11
src/green_classroom/templates/collector/article_detail.html
Normal file
@@ -0,0 +1,11 @@
|
||||
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{{ article.title }}</title>
</head>
<body>
    {# Detail page for one article; expects `article` in the template context. #}
    <h1>{{ article.title }}</h1>
    {# The body text contains newlines; linebreaks turns them into markup so they are not collapsed. #}
    <div>{{ article.content|linebreaks }}</div>
    <a href="{% url 'collector:article_list' %}">返回文章列表</a>
</body>
</html>
|
||||
19
src/green_classroom/templates/collector/article_list.html
Normal file
19
src/green_classroom/templates/collector/article_list.html
Normal file
@@ -0,0 +1,19 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>文章列表</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>所有文章</h1>
|
||||
<ul>
|
||||
{% for article in articles %}
|
||||
<li>
|
||||
<a href="{% url 'collector:article_detail' article_id=article.id %}">
|
||||
<strong>{{ article.title }}</strong>
|
||||
</a><br>
|
||||
{{ article.content|truncatewords:50 }}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,15 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>删除所有文章</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>确认删除所有文章</h1>
|
||||
<p>您确定要删除所有文章吗?此操作不可撤销。</p>
|
||||
<form method="post">
|
||||
{% csrf_token %}
|
||||
<button type="submit">删除所有文章</button>
|
||||
</form>
|
||||
<a href="{% url 'collector:article_list' %}">取消</a>
|
||||
</body>
|
||||
</html>
|
||||
17
src/green_classroom/templates/collector/run_crawler.html
Normal file
17
src/green_classroom/templates/collector/run_crawler.html
Normal file
@@ -0,0 +1,17 @@
|
||||
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>运行爬虫</title>
</head>
<body>
    {# Form that triggers the crawler via POST; `output` holds the captured command output. #}
    <h1>运行爬虫</h1>
    <form method="post">
        {% csrf_token %}
        <button type="submit">开始爬取</button>
    </form>
    {% if output %}
    <h2>输出:</h2>
    {# `output` is a list of text chunks; join them so the text is shown, not the list's repr. #}
    <pre>{{ output|join:"" }}</pre>
    {% endif %}
</body>
</html>
|
||||
9
src/green_classroom/urls.py
Normal file
9
src/green_classroom/urls.py
Normal file
@@ -0,0 +1,9 @@
|
||||
from django.urls import path, include
|
||||
from django.contrib import admin # 新增导入
|
||||
from green_classroom.apps.collector import views
|
||||
|
||||
urlpatterns = [
|
||||
path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
|
||||
path('articles/', views.list_articles, name='article_list'),
|
||||
path('admin/', admin.site.urls), # 添加这一行以恢复 admin 页面访问
|
||||
]
|
||||
16
src/green_classroom/wsgi.py
Normal file
16
src/green_classroom/wsgi.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
WSGI config for green_classroom project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
|
||||
|
||||
application = get_wsgi_application()
|
||||
22
src/manage.py
Executable file
22
src/manage.py
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
"""Django's command-line utility for administrative tasks."""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def main():
|
||||
"""Run administrative tasks."""
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
|
||||
try:
|
||||
from django.core.management import execute_from_command_line
|
||||
except ImportError as exc:
|
||||
raise ImportError(
|
||||
"Couldn't import Django. Are you sure it's installed and "
|
||||
"available on your PYTHONPATH environment variable? Did you "
|
||||
"forget to activate a virtual environment?"
|
||||
) from exc
|
||||
execute_from_command_line(sys.argv)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
10
src/test_xinhua_article.py
Normal file
10
src/test_xinhua_article.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from green_classroom.apps.collector.crawler.xinhua import parse_xinhua_article
|
||||
|
||||
def main() -> None:
    """Fetch one known Xinhua article and print its title and a body preview."""
    url = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html"
    article = parse_xinhua_article(url)

    if article:
        print("✅ 成功抓取文章:", article["title"])
        # Show only the first 500 characters of the body.
        print("📄 正文预览:\n", article["content"][:500])
    else:
        print("❌ 抓取失败")


# Guarded so that importing this module (for example during test collection,
# since the file name matches test_*.py) does not trigger a network request.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user