diff --git a/crawler/crawler_engine.py b/crawler/crawler_engine.py index 4763ad8..29433fd 100644 --- a/crawler/crawler_engine.py +++ b/crawler/crawler_engine.py @@ -461,27 +461,51 @@ class WebsiteCrawler: publish_date = self.extract_publish_date(article_soup) author = self.extract_author(article_soup) - # 保存内容 - crawled_content = CrawledContent.objects.create( - task=self.task, - website=website, - title=title, - content=content, + # 检查是否已存在相同URL的文章 + existing_content = CrawledContent.objects.filter( url=link_info['url'], - publish_date=publish_date, - author=author, - keywords_matched=','.join(matched_keywords), - is_local_saved=False # 初始设置为False,保存到本地后会更新为True - ) + task=self.task + ).first() - # 提取并下载媒体文件 - media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url']) - - # 标记内容已保存 - self.mark_content_saved(crawled_content) + if existing_content: + # 如果已存在,更新现有记录而不是创建新记录 + existing_content.title = title + existing_content.content = content + existing_content.publish_date = publish_date + existing_content.author = author + existing_content.keywords_matched = ','.join(matched_keywords) + existing_content.save() + + # 更新媒体文件 + # 先删除旧的媒体文件 + existing_content.media_files.all().delete() + # 然后重新下载媒体文件 + media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url']) + + self.log('info', f'更新已存在的文章: {title[:50]}...', website) + else: + # 保存新内容 + crawled_content = CrawledContent.objects.create( + task=self.task, + website=website, + title=title, + content=content, + url=link_info['url'], + publish_date=publish_date, + author=author, + keywords_matched=','.join(matched_keywords), + is_local_saved=False # 初始设置为False,保存到本地后会更新为True + ) + + # 提取并下载媒体文件 + media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url']) + + # 标记内容已保存 + self.mark_content_saved(crawled_content) + + self.log('info', f'保存新文章: {title[:50]}...', website) crawled_count += 1 - self.log('info', f'保存文章: {title[:50]}...', website) # 请求间隔 time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY']) diff --git a/crawler/templates/crawler/dashboard.html b/crawler/templates/crawler/dashboard.html index 5114a79..c356159 100644 --- a/crawler/templates/crawler/dashboard.html +++ b/crawler/templates/crawler/dashboard.html @@ -151,61 +151,80 @@