diff --git a/docs/source/topics/text-formatting.rst b/docs/source/topics/text-formatting.rst index 3ab0a2d95d..0619c2a5a6 100644 --- a/docs/source/topics/text-formatting.rst +++ b/docs/source/topics/text-formatting.rst @@ -40,8 +40,98 @@ list of the basic styles currently supported by Pyrogram. - spoiler - `text URL `_ - `user text mention `_ +- :emoji:`👍` +HTML Style +---------- + +To strictly use this mode, pass :obj:`~pyrogram.enums.ParseMode.HTML` to the *parse_mode* parameter when using +:meth:`~pyrogram.Client.send_message`. The following tags are currently supported: + +.. code-block:: text + + bold, bold + + italic, italic + + underline + + strike, strike, strike + + spoiler + + text URL + + inline mention + + inline fixed-width code + + 👍 + +
+    pre-formatted
+      fixed-width
+        code block
+    
+ +**Example**: + +.. code-block:: python + + from pyrogram.enums import ParseMode + + await app.send_message( + chat_id="me", + text=( + "bold, bold" + "italic, italic" + "underline, underline" + "strike, strike, strike" + "spoiler\n\n" + + "bold italic bold italic bold strike italic bold strike spoiler underline italic bold bold\n\n" + + "inline URL " + "inline mention of a user\n" + "👍 " + "inline fixed-width code " + "
pre-formatted fixed-width code block
\n\n" + "
"
+            "for i in range(10):\n"
+            "    print(i)"
+            "
\n\n" + + "
Block quotation started" + "Block quotation continued" + "The last line of the block quotation
" + "
Expandable block quotation started" + "Expandable block quotation continued" + "Expandable block quotation continued" + "Hidden by default part of the block quotation started" + "Expandable block quotation continued" + "The last line of the block quotation
" + ), + parse_mode=ParseMode.HTML + ) + +.. note:: + + All ``<``, ``>`` and ``&`` symbols that are not a part of a tag or an HTML entity must be replaced with the + corresponding HTML entities (``<`` with ``&lt;``, ``>`` with ``&gt;`` and ``&`` with ``&amp;``). You can use this + snippet to quickly escape those characters: + + .. code-block:: python + + text = "<my & text>" + text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") + + print(text) + + .. code-block:: text + + &lt;my &amp; text&gt; + Markdown Style -------------- @@ -107,11 +197,12 @@ To strictly use this mode, pass :obj:`~pyrogram.enums.ParseMode.MARKDOWN` to the "~~strike~~, " "||spoiler||, " "[URL](https://telegramplayground.github.io/pyrogram/), " + "![👍](tg://emoji?id=5469770542288478598)" "`code`, " - "```" + "```py" "for i in range(10):\n" " print(i)" - "```" + "```\n" ">blockquote\n" @@ -135,96 +226,6 @@ To strictly use this mode, pass :obj:`~pyrogram.enums.ParseMode.MARKDOWN` to the parse_mode=ParseMode.MARKDOWN ) -HTML Style ----------- - -To strictly use this mode, pass :obj:`~pyrogram.enums.HTML` to the *parse_mode* parameter when using -:meth:`~pyrogram.Client.send_message`. The following tags are currently supported: - -.. code-block:: text - - bold, bold - - italic, italic - - underline - - strike, strike, strike - - spoiler - - text URL - - inline mention - - inline fixed-width code - - 🔥 - -
-    pre-formatted
-      fixed-width
-        code block
-    
- -**Example**: - -.. code-block:: python - - from pyrogram.enums import ParseMode - - await app.send_message( - chat_id="me", - text=( - "bold, bold" - "italic, italic" - "underline, underline" - "strike, strike, strike" - "spoiler\n\n" - - "bold italic bold italic bold strike italic bold strike spoiler underline italic bold bold\n\n" - - "inline URL " - "inline mention of a user\n" - "👍 " - "inline fixed-width code " - "
pre-formatted fixed-width code block
\n\n" - "
"
-            "for i in range(10):\n"
-            "    print(i)"
-            "
\n\n" - - "
Block quotation started" - "Block quotation continued" - "The last line of the block quotation
" - "
Expandable block quotation started" - "Expandable block quotation continued" - "Expandable block quotation continued" - "Hidden by default part of the block quotation started" - "Expandable block quotation continued" - "The last line of the block quotation
" - ), - parse_mode=ParseMode.HTML - ) - -.. note:: - - All ``<``, ``>`` and ``&`` symbols that are not a part of a tag or an HTML entity must be replaced with the - corresponding HTML entities (``<`` with ``&lt;``, ``>`` with ``&gt;`` and ``&`` with ``&amp;``). You can use this - snippet to quickly escape those characters: - - .. code-block:: python - - import html - - text = "<my text>" - text = html.escape(text) - - print(text) - - .. code-block:: text - - &lt;my text&gt; Different Styles ---------------- @@ -272,6 +273,13 @@ Result: Nested and Overlapping Entities ------------------------------- +.. warning:: + + The Markdown style is not recommended for complex text formatting. + + If you need complex text formatting, such as nested or overlapping entities, use the HTML style instead. + + You can also style texts with more than one decoration at once by nesting entities together. For example, you can send a text message with both :bold-underline:`bold and underline` styles, or a text that has both :strike-italic:`italic and strike` styles, and you can still combine both Markdown and HTML together. diff --git a/pyrogram/parser/__init__.py b/pyrogram/parser/__init__.py index 00c7acae76..af477e50fe 100644 --- a/pyrogram/parser/__init__.py +++ b/pyrogram/parser/__init__.py @@ -1,5 +1,5 @@ # Pyrogram - Telegram MTProto API Client Library for Python -# Copyright (C) 2017-present Dan +# Copyright (C) 2017-present # # This file is part of Pyrogram. # diff --git a/pyrogram/parser/html.py b/pyrogram/parser/html.py index 594feba04b..f5e53250ac 100644 --- a/pyrogram/parser/html.py +++ b/pyrogram/parser/html.py @@ -1,5 +1,5 @@ # Pyrogram - Telegram MTProto API Client Library for Python -# Copyright (C) 2017-present Dan +# Copyright (C) 2017-present # # This file is part of Pyrogram. 
# @@ -178,16 +178,13 @@ def parse_one(entity): language = getattr(entity, "language", "") or "" start_tag = f'<{name} language="{language}">' if language else f"<{name}>" end_tag = f"" - elif entity_type == MessageEntityType.BLOCKQUOTE: - name = entity_type.name.lower() - start_tag = f"<{name}>" - end_tag = f"" elif entity_type == MessageEntityType.EXPANDABLE_BLOCKQUOTE: name = "blockquote" start_tag = f"<{name} expandable>" end_tag = f"" elif entity_type in ( MessageEntityType.CODE, + MessageEntityType.BLOCKQUOTE, MessageEntityType.SPOILER, ): name = entity_type.name.lower() diff --git a/pyrogram/parser/markdown.py b/pyrogram/parser/markdown.py index 86676cc2fc..167c1655e1 100644 --- a/pyrogram/parser/markdown.py +++ b/pyrogram/parser/markdown.py @@ -1,5 +1,5 @@ # Pyrogram - Telegram MTProto API Client Library for Python -# Copyright (C) 2017-present Dan +# Copyright (C) 2017-present # # This file is part of Pyrogram. # @@ -36,8 +36,7 @@ BLOCKQUOTE_DELIM = ">" BLOCKQUOTE_ESCAPE_DELIM = "|>" BLOCKQUOTE_EXPANDABLE_DELIM = "**>" -BLOCKQUOTE_EXPANDABLE_END_DELIM = "<**" - +BLOCKQUOTE_EXPANDABLE_OPTIONAL_END_DELIM = "<**" MARKDOWN_RE = re.compile( r"({d})|(!?)\[(.+?)\]\((.+?)\)".format( @@ -66,6 +65,8 @@ URL_MARKUP = '{}' EMOJI_MARKUP = "{}" FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM] +CODE_TAG_RE = re.compile(r".*?") +URL_RE = re.compile(r"(!?)\[(.+?)\]\((.+?)\)") class Markdown: @@ -80,6 +81,7 @@ def escape_and_create_quotes(text: str, strict: bool): html_escaped_list: list[int] = [] # Temporary Queue to hold lines to be quoted + # Index and Line to_quote_list: list[tuple[int, str]] = [] def create_blockquote(quote_type: str = "") -> None: @@ -91,83 +93,84 @@ def create_blockquote(quote_type: str = "") -> None: if len(to_quote_list) == 0: return - joined_lines = "\n".join([i[1] for i in to_quote_list]) + # Create quoted text block + joined_lines = "\n".join([text for _, text in to_quote_list]) first_line_index, _ = to_quote_list[0] - text_lines[first_line_index] = ( 
- f"{joined_lines}" - ) - - for line_to_remove in to_quote_list[1:]: - text_lines[line_to_remove[0]] = None - to_quote_list.clear() + # Enclose the block in html quote + # and add to starting index of quoted line + text_lines[first_line_index] = f"{joined_lines}" - # Handle Expandable Quote - inside_blockquote = False - for index, line in enumerate(text_lines): - if line.startswith(BLOCKQUOTE_EXPANDABLE_DELIM) and not inside_blockquote: - delim_stripped_line = line[3:] - parsed_line = ( - html.escape(delim_stripped_line) if strict else delim_stripped_line - ) + # Set None Placeholders for preserving indexes + for idx, line_to_remove in to_quote_list[1:]: + text_lines[idx] = None - to_quote_list.append((index, parsed_line)) - html_escaped_list.append(index) - - inside_blockquote = True - continue + # clear queue + to_quote_list.clear() - elif line.endswith(BLOCKQUOTE_EXPANDABLE_END_DELIM) and inside_blockquote: - delim_stripped_line = line[:-3] - parsed_line = ( - html.escape(delim_stripped_line) if strict else delim_stripped_line - ) + def process_text(start_delimiter, end_delimiter: str = "", quote_type: str = ""): + for index, line in enumerate(text_lines): + # Ignore None placeholders from previous runs + if line is None: + continue - to_quote_list.append((index, parsed_line)) - html_escaped_list.append(index) + # Ignore Escaped > + if line.startswith(BLOCKQUOTE_ESCAPE_DELIM): + text_lines[index] = line[1:] + create_blockquote(quote_type=quote_type) + continue - inside_blockquote = False + # Parse lines starting with delimiter + if line.startswith(start_delimiter): + endswith_delimiter = end_delimiter and line.endswith(end_delimiter) - create_blockquote(quote_type=" expandable") + # Indexes to skip in line + start_index = len(start_delimiter) + end_index = end_index = len(line) - len(end_delimiter) if endswith_delimiter else len(line) - if inside_blockquote: - parsed_line = html.escape(line) if strict else line - to_quote_list.append((index, parsed_line)) - 
html_escaped_list.append(index) + # Strip delimiters + delimiter_stripped_line = line[start_index:end_index] - # Handle Single line/Continued Quote - for index, line in enumerate(text_lines): - if line is None: - continue + # Escape if strict + parsed_line = html.escape(delimiter_stripped_line) if strict else delimiter_stripped_line + + # add to queue + to_quote_list.append((index, parsed_line)) - if line.startswith(BLOCKQUOTE_ESCAPE_DELIM): - text_lines[index] = line[1:] - create_blockquote() - continue + # save line index + html_escaped_list.append(index) - if line.startswith(BLOCKQUOTE_DELIM): - delim_stripped_line = line[1:] - parsed_line = ( - html.escape(delim_stripped_line) if strict else delim_stripped_line - ) + # if line doesn't end with delimiter continue loop + if not endswith_delimiter: + continue - to_quote_list.append((index, parsed_line)) - html_escaped_list.append(index) + # If line doesn't start with a delimiter + # or has ended with delimiter + # it means the block quote has ended + # create pending quotes if any + create_blockquote(quote_type=quote_type) - elif len(to_quote_list) > 0: - create_blockquote() - else: - create_blockquote() + else: + # is triggered when there's only one line of text + # the line above won't be triggered + # because loop will exit after first iteration + # so try to create quote if any in queue + create_blockquote(quote_type=quote_type) + + process_text( + start_delimiter=BLOCKQUOTE_EXPANDABLE_DELIM, + end_delimiter=BLOCKQUOTE_EXPANDABLE_OPTIONAL_END_DELIM, + quote_type=" expandable", + ) + process_text(start_delimiter=BLOCKQUOTE_DELIM) if strict: for idx, line in enumerate(text_lines): if idx not in html_escaped_list: text_lines[idx] = html.escape(line) - return "\n".join( - [valid_line for valid_line in text_lines if valid_line is not None] - ) + return "\n".join(filter(lambda x: x is not None, text_lines)) async def parse(self, text: str, strict: bool = False): text = self.escape_and_create_quotes(text, 
strict=strict) @@ -186,17 +189,13 @@ async def parse(self, text: str, strict: bool = False): continue if not is_emoji and text_url: - text = utils.replace_once( - text, full, URL_MARKUP.format(url, text_url), start - ) + text = utils.replace_once(text, full, URL_MARKUP.format(url, text_url), start) continue if is_emoji: emoji = text_url emoji_id = url.lstrip("tg://emoji?id=") - text = utils.replace_once( - text, full, EMOJI_MARKUP.format(emoji_id, emoji), start - ) + text = utils.replace_once(text, full, EMOJI_MARKUP.format(emoji_id, emoji), start) continue if delim == BOLD_DELIM: @@ -237,96 +236,118 @@ async def parse(self, text: str, strict: bool = False): @staticmethod def unparse(text: str, entities: list): + """ + https://github.com/LonamiWebs/Telethon/blob/141b620/telethon/extensions/markdown.py#L137-L193 + + Performs the reverse operation to .parse(), effectively returning + markdown-like syntax given a normal text and its MessageEntity's. + + :param text: the text to be reconverted into markdown. + :param entities: list of MessageEntity's applied to the text. + :return: a markdown-like text representing the combination of both inputs. 
+ """ + delimiters = { + MessageEntityType.BOLD: BOLD_DELIM, + MessageEntityType.ITALIC: ITALIC_DELIM, + MessageEntityType.UNDERLINE: UNDERLINE_DELIM, + MessageEntityType.STRIKETHROUGH: STRIKE_DELIM, + MessageEntityType.CODE: CODE_DELIM, + MessageEntityType.PRE: PRE_DELIM, + MessageEntityType.BLOCKQUOTE: BLOCKQUOTE_DELIM, + MessageEntityType.EXPANDABLE_BLOCKQUOTE: BLOCKQUOTE_EXPANDABLE_DELIM, + MessageEntityType.SPOILER: SPOILER_DELIM, + } + text = utils.add_surrogates(text) - entities_offsets = [] - - for entity in entities: - entity_type = entity.type - start = entity.offset - end = start + entity.length - - if entity_type == MessageEntityType.BOLD: - start_tag = end_tag = BOLD_DELIM - elif entity_type == MessageEntityType.ITALIC: - start_tag = end_tag = ITALIC_DELIM - elif entity_type == MessageEntityType.UNDERLINE: - start_tag = end_tag = UNDERLINE_DELIM - elif entity_type == MessageEntityType.STRIKETHROUGH: - start_tag = end_tag = STRIKE_DELIM - elif entity_type == MessageEntityType.CODE: - start_tag = end_tag = CODE_DELIM - elif entity_type == MessageEntityType.PRE: - language = getattr(entity, "language", "") or "" - start_tag = f"{PRE_DELIM}{language}\n" - end_tag = f"\n{PRE_DELIM}" - elif entity_type == MessageEntityType.BLOCKQUOTE: - start_tag = BLOCKQUOTE_DELIM + " " - end_tag = "" - blockquote_text = text[start:end] - lines = blockquote_text.split("\n") - last_length = 0 - for line in lines: - if len(line) == 0 and last_length == end: - continue - start_offset = start + last_length - last_length = last_length + len(line) - end_offset = start_offset + last_length - entities_offsets.append( - ( - start_tag, - start_offset, - ) + insert_at = [] + for i, entity in enumerate(entities): + s = entity.offset + e = entity.offset + entity.length + delimiter = delimiters.get(entity.type, None) + if delimiter: + if entity.type == MessageEntityType.PRE: + inside_blockquote = any( + blk_entity.offset <= s < blk_entity.offset + blk_entity.length + and 
blk_entity.offset < e <= blk_entity.offset + blk_entity.length + for blk_entity in entities + if blk_entity.type == MessageEntityType.BLOCKQUOTE ) - entities_offsets.append( - ( - end_tag, - end_offset, - ) + is_expandable = any( + blk_entity.offset <= s < blk_entity.offset + blk_entity.length + and blk_entity.offset < e <= blk_entity.offset + blk_entity.length + # and blk_entity.collapsed + for blk_entity in entities + if blk_entity.type == MessageEntityType.EXPANDABLE_BLOCKQUOTE ) - last_length = last_length + 1 - continue - elif entity_type == MessageEntityType.EXPANDABLE_BLOCKQUOTE: - start_tag = BLOCKQUOTE_EXPANDABLE_DELIM + " " - end_tag = " " + BLOCKQUOTE_EXPANDABLE_END_DELIM - elif entity_type == MessageEntityType.SPOILER: - start_tag = end_tag = SPOILER_DELIM - elif entity_type == MessageEntityType.TEXT_LINK: - url = entity.url - start_tag = "[" - end_tag = f"]({url})" - elif entity_type == MessageEntityType.TEXT_MENTION: - user = entity.user - start_tag = "[" - end_tag = f"](tg://user?id={user.id})" - elif entity_type == MessageEntityType.CUSTOM_EMOJI: - emoji_id = entity.custom_emoji_id - start_tag = "![" - end_tag = f"](tg://emoji?id={emoji_id})" + if inside_blockquote: + if is_expandable: + if entity.language: + open_delimiter = f"{delimiter}{entity.language}\n**>" + else: + open_delimiter = f"{delimiter}\n**>" + close_delimiter = f"\n**>{delimiter}" + else: + if entity.language: + open_delimiter = f"{delimiter}{entity.language}\n>" + else: + open_delimiter = f"{delimiter}\n>" + close_delimiter = f"\n>{delimiter}" + else: + if entity.language: + open_delimiter = f"{delimiter}{entity.language}\n" + else: + open_delimiter = f"{delimiter}\n" + close_delimiter = delimiter + insert_at.append((s, i, open_delimiter)) + insert_at.append((e, -i, close_delimiter)) + elif ( + entity.type != MessageEntityType.BLOCKQUOTE + and entity.type != MessageEntityType.EXPANDABLE_BLOCKQUOTE + ): + open_delimiter = delimiter + close_delimiter = delimiter + 
insert_at.append((s, i, open_delimiter)) + insert_at.append((e, -i, close_delimiter)) + else: + # Handle multiline blockquotes + text_subset = text[s:e] + lines = text_subset.splitlines() + for line_num, line in enumerate(lines): + line_start = s + sum(len(l) + 1 for l in lines[:line_num]) + if entity.type == MessageEntityType.EXPANDABLE_BLOCKQUOTE: + insert_at.append((line_start, i, BLOCKQUOTE_EXPANDABLE_DELIM)) + else: + insert_at.append((line_start, i, BLOCKQUOTE_DELIM)) + # No closing delimiter for blockquotes else: - continue - - entities_offsets.append( - ( - start_tag, - start, - ) - ) - entities_offsets.append( - ( - end_tag, - end, - ) - ) - - entities_offsets = map( - lambda x: x[1], - sorted( - enumerate(entities_offsets), key=lambda x: (x[1][1], x[0]), reverse=True - ), - ) - - for entity, offset in entities_offsets: - text = text[:offset] + entity + text[offset:] + url = None + is_emoji = False + if entity.type == MessageEntityType.TEXT_LINK: + url = entity.url + elif entity.type == MessageEntityType.TEXT_MENTION: + url = f"tg://user?id={entity.user.id}" + elif entity.type == MessageEntityType.CUSTOM_EMOJI: + url = f"tg://emoji?id={entity.custom_emoji_id}" + is_emoji = True + if url: + if is_emoji: + insert_at.append((s, i, "![")) + else: + insert_at.append((s, i, "[")) + insert_at.append((e, -i, f"]({url})")) + + insert_at.sort(key=lambda t: (t[0], t[1])) + while insert_at: + at, _, what = insert_at.pop() + + # If we are in the middle of a surrogate nudge the position by -1. + # Otherwise we would end up with malformed text and fail to encode. 
+ # For example of bad input: "Hi \ud83d\ude1c" + # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF + while utils.within_surrogate(text, at): + at += 1 + + text = text[:at] + what + text[at:] return utils.remove_surrogates(text) diff --git a/pyrogram/parser/parser.py b/pyrogram/parser/parser.py index 0ce2b2375c..e2de12144e 100644 --- a/pyrogram/parser/parser.py +++ b/pyrogram/parser/parser.py @@ -1,5 +1,5 @@ # Pyrogram - Telegram MTProto API Client Library for Python -# Copyright (C) 2017-present Dan +# Copyright (C) 2017-present # # This file is part of Pyrogram. # diff --git a/pyrogram/parser/utils.py b/pyrogram/parser/utils.py index 32c81707f6..e011976943 100644 --- a/pyrogram/parser/utils.py +++ b/pyrogram/parser/utils.py @@ -1,5 +1,5 @@ # Pyrogram - Telegram MTProto API Client Library for Python -# Copyright (C) 2017-present Dan +# Copyright (C) 2017-present # # This file is part of Pyrogram. # @@ -39,3 +39,19 @@ def remove_surrogates(text): def replace_once(source: str, old: str, new: str, start: int): return source[:start] + source[start:].replace(old, new, 1) + + +def within_surrogate(text, index, *, length=None): + """ + https://github.com/LonamiWebs/Telethon/blob/63d9b26/telethon/helpers.py#L52-L63 + + `True` if ``index`` is within a surrogate (before and after it, not at!). + """ + if length is None: + length = len(text) + + return ( + 1 < index < len(text) and # in bounds + '\ud800' <= text[index - 1] <= '\udbff' and # previous is + '\ud800' <= text[index] <= '\udfff' # current is + )