Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hacky special-case solution for links in feed excerpts #265

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions mkdocs_rss_plugin/hacky_fix_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import re
from urllib.parse import urljoin

# Each pattern captures (group 1) the value of a double-quoted attribute.
# NOTE: they are plain substring patterns, so an attribute whose name merely
# ends in ``href``/``src`` (e.g. ``data-src``) is matched as well.
HREF_MATCH_PATTERN = re.compile('href="(.*?)"')
SRC_MATCH_PATTERN = re.compile('src="(.*?)"')


def relative_links_resolve_to_page(page_html: str, page_url: str) -> str:
    """Resolve ``href``/``src`` attribute values against the URL of their page.

    Every double-quoted ``href="..."`` and ``src="..."`` value found in
    ``page_html`` is replaced by ``urljoin(page_url, value)``. Values that are
    already absolute URLs come back from ``urljoin`` unchanged.

    Only the text inside the matched attribute is rewritten. A global
    ``str.replace`` of the link text must not be used here: it also rewrites
    identical text elsewhere in the document (a link ``a`` would corrupt every
    ``<a`` tag), it inserts the replacement between every character when the
    link is empty, and links that are substrings of one another would be
    rewritten more than once.

    Args:
        page_html: HTML fragment whose links should be rewritten.
        page_url: URL of the page the fragment belongs to, used as the base
            for resolution.

    Returns:
        The HTML with the matched attribute values resolved against
        ``page_url``; all other text is left untouched.
    """

    def _absolutize(match):
        # Rebuild the matched attribute, swapping only the captured value so
        # the attribute name and the surrounding quotes are preserved as-is.
        attribute = match.group(0)
        value_start = match.start(1) - match.start(0)
        value_end = match.end(1) - match.start(0)
        return (
            attribute[:value_start]
            + urljoin(page_url, match.group(1))
            + attribute[value_end:]
        )

    replaced_html = page_html
    # A captured value can never contain a double quote, so the first pass
    # cannot create text that the second pattern would newly match.
    for pattern in (HREF_MATCH_PATTERN, SRC_MATCH_PATTERN):
        replaced_html = pattern.sub(_absolutize, replaced_html)
    return replaced_html


# Wrapper elements to strip: each pattern matches an opening tag, captures the
# element's inner content (group 1) and consumes the first closing tag that
# follows. DOTALL lets ``.`` span line breaks inside the element.
WRAPPER_PATTERNS = [
    re.compile('<a class="glightbox".*?>(.*?)</a>', re.DOTALL),
    re.compile('<div class="grid cards".*?>(.*?)</div>', re.DOTALL),
]


def remove_wrappers(page_html):
    """Strip the known wrapper elements from ``page_html``, keeping their content.

    Each pattern in ``WRAPPER_PATTERNS`` is applied in order; every match is
    replaced by its captured inner content.

    Args:
        page_html: HTML text to clean up.

    Returns:
        The HTML with every matched wrapper replaced by what it enclosed.
    """
    unwrapped = page_html
    for pattern in WRAPPER_PATTERNS:
        unwrapped = pattern.sub(r"\g<1>", unwrapped)
    return unwrapped
1 change: 1 addition & 0 deletions mkdocs_rss_plugin/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ def on_page_content(
created=page_dates[0],
description=self.util.get_description_or_abstract(
in_page=page,
html=html,
chars_count=self.config.abstract_chars_count,
abstract_delimiter=self.config.abstract_delimiter,
),
Expand Down
18 changes: 10 additions & 8 deletions mkdocs_rss_plugin/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
REMOTE_REQUEST_HEADERS,
)
from mkdocs_rss_plugin.git_manager.ci import CiHandler
from mkdocs_rss_plugin.hacky_fix_links import (
relative_links_resolve_to_page,
remove_wrappers,
)
from mkdocs_rss_plugin.integrations.theme_material_social_plugin import (
IntegrationMaterialSocialCards,
)
Expand Down Expand Up @@ -475,6 +479,7 @@ def get_date_from_meta(
def get_description_or_abstract(
self,
in_page: Page,
html: str,
chars_count: int = 160,
abstract_delimiter: Optional[str] = None,
) -> str:
Expand Down Expand Up @@ -509,15 +514,12 @@ def get_description_or_abstract(
# If the abstract is cut by the delimiter
elif (
abstract_delimiter
and (
excerpt_separator_position := in_page.markdown.find(abstract_delimiter)
)
> -1
and (excerpt_separator_position := html.find(abstract_delimiter)) > -1
):
return markdown.markdown(
in_page.markdown[:excerpt_separator_position],
output_format="html5",
)
replaced_links = relative_links_resolve_to_page(html, in_page.canonical_url)
removed_wrappers = remove_wrappers(replaced_links)
return removed_wrappers[: removed_wrappers.find(abstract_delimiter)]

# Use first chars_count from the markdown
elif chars_count > 0 and in_page.markdown:
if len(in_page.markdown) <= chars_count:
Expand Down
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/fixtures/docs/blog/posts/sample_blog_post_internal_links.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
date: 2023-02-12
authors: [guts]
categories:
- Blog
---

# Blog sample with internal links

I'm a really short intro.

![here's an internal image](./assets/example_image.webp)

[Here's an internal link](./sample_blog_post.md)
and another
[Another link](../../index.md)

<!-- more -->

## This part won't show up in RSS feed

### What is Lorem Ipsum?

Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
37 changes: 36 additions & 1 deletion tests/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def test_simple_build_item_delimiter(self):

for feed_item in feed_parsed.entries:
if feed_item.title in ("Page without meta with early delimiter",):
self.assertLess(len(feed_item.description), 50, feed_item.title)
self.assertLess(len(feed_item.description), 100, feed_item.title)

def test_simple_build_item_delimiter_empty(self):
with tempfile.TemporaryDirectory() as tmpdirname:
Expand Down Expand Up @@ -904,6 +904,41 @@ def test_not_git_repo(self):
# restore name
git_dir_tmp.replace(git_dir)

def test_abstract_with_internal_links(self):
    """Check that internal links in the feed abstract are emitted as absolute URLs.

    Builds the fixture docs, parses both generated feeds and, for each entry
    titled "Blog sample with internal links", asserts that its summary
    contains the expected absolute ``href``/``src`` attributes.
    """
    # NOTE(review): if no entry carries the expected title the loop below
    # asserts nothing and the test still passes - confirm the fixture post is
    # always present in the generated feeds.
    expected_fragments = (
        'href="https://guts.github.io/mkdocs-rss-plugin/blog/posts/sample_blog_post/"',
        'href="https://guts.github.io/mkdocs-rss-plugin/"',
        'src="https://guts.github.io/mkdocs-rss-plugin/blog/posts/assets/example_image.webp"',
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        build_result = self.build_docs_setup(
            testproject_path="docs",
            mkdocs_yml_filepath=Path("tests/fixtures/mkdocs_minimal.yml"),
            output_path=tmpdirname,
            strict=True,
        )
        self.assertEqual(build_result.exit_code, 0)
        self.assertIsNone(build_result.exception)

        created_feed = feedparser.parse(Path(tmpdirname) / OUTPUT_RSS_FEED_CREATED)
        updated_feed = feedparser.parse(Path(tmpdirname) / OUTPUT_RSS_FEED_UPDATED)

        for entry in created_feed.entries + updated_feed.entries:
            if entry.title != "Blog sample with internal links":
                continue
            for fragment in expected_fragments:
                self.assertIn(fragment, entry.summary)


# ##############################################################################
# ##### Stand alone program ########
Expand Down