# My-Personal-Website/feed_gen.py
# Generates RSS and Atom feeds from XHTML blog posts.
import os
import datetime
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from pytz import UTC # Ensures timezone-aware datetime
# Configuration
BASE_URL = 'https://purplebored.pl'      # Site root; absolute post URLs are built from this
POSTS_DIR = './blog/posts'               # Directory tree walked for .xhtml posts
FEED_OUTPUT_DIR = './feeds'              # Output directory for feed.rss / feed.atom
AUTHOR_NAME = 'Purplebored'              # Feed-level author name
AUTHOR_EMAIL = 'purplebored@posteo.com'  # Feed-level author email
def parse_xhtml_post(filepath):
    """Parse a single XHTML blog post and extract feed metadata.

    Args:
        filepath: Path to the .xhtml file to parse.

    Returns:
        dict with keys:
            'title':   post title (og:title meta, then <title>, else 'Untitled')
            'url':     absolute URL of the post under BASE_URL
            'date':    timezone-aware (UTC) publication datetime
            'content': post markup as a string (<article>, falling back to <body>)
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    # Title: prefer <meta name="og:title">. Open Graph tags are conventionally
    # written with property= rather than name=, so also try that before
    # falling back to the <title> tag.
    og_title = (soup.find('meta', attrs={'name': 'og:title'})
                or soup.find('meta', attrs={'property': 'og:title'}))
    if og_title and og_title.get('content'):
        title = og_title['content']
    else:
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'Untitled'

    # Publication date: <meta name="date" content="YYYY-MM-DD">, falling back
    # to the file's mtime. Either way mark it UTC — feedgen requires
    # timezone-aware datetimes.
    date_meta = soup.find('meta', {'name': 'date'})
    if date_meta and date_meta.get('content'):
        pub_date = datetime.datetime.strptime(date_meta['content'], '%Y-%m-%d')
    else:
        pub_date = datetime.datetime.fromtimestamp(os.path.getmtime(filepath))
    pub_date = pub_date.replace(tzinfo=UTC)

    # Content: prefer <article>, fall back to <body>. Guard the no-match case
    # so the feed entry doesn't end up containing the literal string "None".
    content = soup.find('article') or soup.find('body')
    content_html = str(content) if content is not None else ''

    # Build the post's absolute URL from its path relative to POSTS_DIR,
    # normalizing Windows path separators.
    rel_path = os.path.relpath(filepath, POSTS_DIR)
    url = f'{BASE_URL}/blog/posts/{rel_path}'.replace('\\', '/')

    return {
        'title': title,
        'url': url,
        'date': pub_date,
        'content': content_html,
    }
def generate_feeds(posts):
    """Write RSS and Atom feeds for *posts* into FEED_OUTPUT_DIR.

    Args:
        posts: list of dicts as produced by parse_xhtml_post()
               (keys: 'title', 'url', 'date', 'content').

    Side effects:
        Creates FEED_OUTPUT_DIR if missing and writes feed.rss and feed.atom.
    """
    fg = FeedGenerator()
    fg.id(BASE_URL)
    fg.title('Purplebored Blog')
    fg.author({'name': AUTHOR_NAME, 'email': AUTHOR_EMAIL})
    fg.link(href=BASE_URL, rel='alternate')
    fg.language('en')
    fg.description('A blog about snuff reviews and other thoughts.')

    # feedgen's add_entry() PREPENDS new entries by default, so iterate
    # oldest-first: each newer post is pushed to the top, leaving the feed
    # newest-first. (Iterating newest-first, as before, inverted the order.)
    for post in sorted(posts, key=lambda p: p['date']):
        fe = fg.add_entry()
        fe.id(post['url'])
        fe.title(post['title'])
        fe.link(href=post['url'])
        fe.published(post['date'])
        fe.content(post['content'], type='xhtml')

    os.makedirs(FEED_OUTPUT_DIR, exist_ok=True)
    fg.rss_file(os.path.join(FEED_OUTPUT_DIR, 'feed.rss'))
    fg.atom_file(os.path.join(FEED_OUTPUT_DIR, 'feed.atom'))
def main():
    """Collect every .xhtml post under POSTS_DIR and regenerate both feeds."""
    posts = [
        parse_xhtml_post(os.path.join(root, name))
        for root, _dirs, names in os.walk(POSTS_DIR)
        for name in names
        if name.endswith('.xhtml')
    ]
    generate_feeds(posts)
    print(f'✅ Generated {len(posts)} posts in RSS and Atom feeds.')


if __name__ == '__main__':
    main()