Support download
@@ -461,27 +461,51 @@ class WebsiteCrawler:
         publish_date = self.extract_publish_date(article_soup)
         author = self.extract_author(article_soup)
 
-        # Save the content
-        crawled_content = CrawledContent.objects.create(
-            task=self.task,
-            website=website,
-            title=title,
-            content=content,
-            url=link_info['url'],
-            publish_date=publish_date,
-            author=author,
-            keywords_matched=','.join(matched_keywords),
-            is_local_saved=False  # Initially False; updated to True once saved locally
-        )
-
-        # Extract and download media files
-        media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
-
-        # Mark the content as saved
-        self.mark_content_saved(crawled_content)
+        # Check whether an article with the same URL already exists
+        existing_content = CrawledContent.objects.filter(
+            url=link_info['url'],
+            task=self.task
+        ).first()
+
+        if existing_content:
+            # If it already exists, update the existing record instead of creating a new one
+            existing_content.title = title
+            existing_content.content = content
+            existing_content.publish_date = publish_date
+            existing_content.author = author
+            existing_content.keywords_matched = ','.join(matched_keywords)
+            existing_content.save()
+
+            # Update the media files:
+            # first delete the old media files,
+            existing_content.media_files.all().delete()
+            # then re-download them
+            media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
+
+            self.log('info', f'Updated existing article: {title[:50]}...', website)
+        else:
+            # Save the new content
+            crawled_content = CrawledContent.objects.create(
+                task=self.task,
+                website=website,
+                title=title,
+                content=content,
+                url=link_info['url'],
+                publish_date=publish_date,
+                author=author,
+                keywords_matched=','.join(matched_keywords),
+                is_local_saved=False  # Initially False; updated to True once saved locally
+            )
+
+            # Extract and download media files
+            media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
+
+            # Mark the content as saved
+            self.mark_content_saved(crawled_content)
+
+            self.log('info', f'Saved new article: {title[:50]}...', website)
 
         crawled_count += 1
-        self.log('info', f'Saved article: {title[:50]}...', website)
 
         # Delay between requests
         time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
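For reference, the filter-then-branch upsert in this diff can also be expressed with Django's built-in QuerySet.update_or_create. Below is a minimal sketch of the equivalent logic, assuming the CrawledContent model, the extract_and_download_media and mark_content_saved helpers, and the surrounding variables shown in the diff; it is an illustration, not the committed code.

    # Equivalent upsert via update_or_create (sketch):
    # url + task identify the row; all other fields go through defaults.
    crawled_content, created = CrawledContent.objects.update_or_create(
        url=link_info['url'],
        task=self.task,
        defaults={
            'website': website,
            'title': title,
            'content': content,
            'publish_date': publish_date,
            'author': author,
            'keywords_matched': ','.join(matched_keywords),
            # Note: unlike the diff's update branch, this also resets
            # is_local_saved=False when an existing row is updated.
            'is_local_saved': False,
        },
    )

    if not created:
        # Mirror the diff: drop the old media files before re-downloading.
        crawled_content.media_files.all().delete()

    media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])

    if created:
        # As in the diff, only newly created content is marked as saved.
        self.mark_content_saved(crawled_content)
        self.log('info', f'Saved new article: {title[:50]}...', website)
    else:
        self.log('info', f'Updated existing article: {title[:50]}...', website)

One caveat: like the committed filter-then-create version, update_or_create is only race-free against concurrent crawls if (url, task) carries a database unique constraint.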