Support download

2025-09-23 15:01:36 +08:00
parent 45c005687d
commit f15b730dca
4 changed files with 227 additions and 64 deletions

@@ -461,27 +461,51 @@ class WebsiteCrawler:
 publish_date = self.extract_publish_date(article_soup)
 author = self.extract_author(article_soup)
-# Save the content
-crawled_content = CrawledContent.objects.create(
-    task=self.task,
-    website=website,
-    title=title,
-    content=content,
-    url=link_info['url'],
-    publish_date=publish_date,
-    author=author,
-    keywords_matched=','.join(matched_keywords),
-    is_local_saved=False  # Initially False; updated to True once saved locally
-)
-# Extract and download media files
-media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
-# Mark the content as saved
-self.mark_content_saved(crawled_content)
+# Check whether an article with the same URL already exists
+existing_content = CrawledContent.objects.filter(
+    url=link_info['url'],
+    task=self.task
+).first()
+if existing_content:
+    # If it exists, update the existing record instead of creating a new one
+    existing_content.title = title
+    existing_content.content = content
+    existing_content.publish_date = publish_date
+    existing_content.author = author
+    existing_content.keywords_matched = ','.join(matched_keywords)
+    existing_content.save()
+    # Refresh the media files:
+    # first delete the old media files,
+    existing_content.media_files.all().delete()
+    # then re-download them
+    media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
+    self.log('info', f'Updated existing article: {title[:50]}...', website)
+else:
+    # Save the new content
+    crawled_content = CrawledContent.objects.create(
+        task=self.task,
+        website=website,
+        title=title,
+        content=content,
+        url=link_info['url'],
+        publish_date=publish_date,
+        author=author,
+        keywords_matched=','.join(matched_keywords),
+        is_local_saved=False  # Initially False; updated to True once saved locally
+    )
+    # Extract and download media files
+    media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
+    # Mark the content as saved
+    self.mark_content_saved(crawled_content)
+    self.log('info', f'Saved new article: {title[:50]}...', website)
 crawled_count += 1
-self.log('info', f'Saved article: {title[:50]}...', website)
 # Delay between requests
 time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
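
Note: the update-or-create branching introduced by this commit could also be collapsed with Django's update_or_create. A minimal sketch, not part of the commit, assuming the CrawledContent fields and the media_files related name shown in the diff, and that is_local_saved defaults to False on the model:

crawled_content, created = CrawledContent.objects.update_or_create(
    url=link_info['url'],
    task=self.task,
    defaults={
        'website': website,
        'title': title,
        'content': content,
        'publish_date': publish_date,
        'author': author,
        'keywords_matched': ','.join(matched_keywords),
    },
)
if not created:
    # Mirror the commit's delete-then-refetch handling of media on updates
    crawled_content.media_files.all().delete()
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
if created:
    self.mark_content_saved(crawled_content)

One trade-off: update_or_create assumes (url, task) matches at most one row and raises MultipleObjectsReturned otherwise, whereas the .first() approach in the commit silently tolerates pre-existing duplicates.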