From 32b1ec37e0d1b46fd7460fcea77753bdb773ff1c Mon Sep 17 00:00:00 2001 From: Thibauld Nion Date: Sat, 22 Jun 2024 21:40:03 +0200 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=20Use=20the=20unicode=20paragraph=20s?= =?UTF-8?q?eparator=20instead=20of=20\n\n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should reduce misfires of the second replace and allow for a more accurate character count, to avoid ellipsizing too much of the content. --- wom_tributary/utils/tweet_summarizers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wom_tributary/utils/tweet_summarizers.py b/wom_tributary/utils/tweet_summarizers.py index 078969b..ad01146 100644 --- a/wom_tributary/utils/tweet_summarizers.py +++ b/wom_tributary/utils/tweet_summarizers.py @@ -74,12 +74,14 @@ def from_activity_item(item, link_builder): def build_content_excerpt(content_unicode): - content_unicode = LINE_BREAK_REGEX.sub("\n\n", content_unicode) + unicode_paragraph = "\u2029" + content_unicode = LINE_BREAK_REGEX.sub(unicode_paragraph, content_unicode) content_unicode = html.unescape(strip_tags(content_unicode)).strip() excerpt = content_unicode[:MAX_CONTENT_SIZE_CHARS].strip() if len(excerpt) < len(content_unicode): excerpt += "(...)" - return excerpt.replace("\n\n", "
") + return excerpt.replace(unicode_paragraph, "
") + def build_tweet_index_by_tag(data, keep_only_after_datetime, link_builder): reverse_index = defaultdict(list)