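"""Generate RSS and Atom feeds from a directory of XHTML blog posts.

Walks POSTS_DIR for *.xhtml files, extracts each post's title, date, and
content, and writes feed.rss and feed.atom into FEED_OUTPUT_DIR. Run it
from the site root so the relative paths resolve.
"""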
import os
import datetime

from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from pytz import UTC  # Ensures timezone-aware datetimes

# Configuration
BASE_URL = 'https://purplebored.pl'
POSTS_DIR = './blog/posts'
FEED_OUTPUT_DIR = './feeds'
AUTHOR_NAME = 'Purplebored'
AUTHOR_EMAIL = 'purplebored@posteo.com'


def parse_xhtml_post(filepath):
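    """Parse one XHTML post and return its title, URL, date, and content."""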
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    # Prefer <meta name="og:title" content="...">, fall back to <title>
    og_title = soup.find('meta', attrs={'name': 'og:title'})
    if og_title and og_title.get('content'):
        title = og_title['content']
    else:
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'Untitled'

    # Parse <meta name="date" content="YYYY-MM-DD">, falling back to the
    # file's modification time when the meta tag is missing
    date_meta = soup.find('meta', {'name': 'date'})
    if date_meta and date_meta.get('content'):
        pub_date = datetime.datetime.strptime(date_meta['content'], '%Y-%m-%d')
    else:
        pub_date = datetime.datetime.fromtimestamp(os.path.getmtime(filepath))

    pub_date = pub_date.replace(tzinfo=UTC)

    # Extract the <article> element, or fall back to <body>
    content = soup.find('article') or soup.find('body')

    # Build the post's absolute URL from its path relative to POSTS_DIR
    rel_path = os.path.relpath(filepath, POSTS_DIR)
    url = f'{BASE_URL}/blog/posts/{rel_path}'.replace('\\', '/')

    return {
        'title': title,
        'url': url,
        'date': pub_date,
        'content': str(content) if content else '',  # avoid str(None)
    }


def generate_feeds(posts):
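    """Build the feed from parsed posts and write RSS and Atom files."""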
    fg = FeedGenerator()
    fg.id(BASE_URL)
    fg.title('Purplebored Blog')
    fg.author({'name': AUTHOR_NAME, 'email': AUTHOR_EMAIL})
    fg.link(href=BASE_URL, rel='alternate')
    fg.language('en')
    fg.description('A blog about snuff reviews and other thoughts.')

    # feedgen prepends new entries by default, which would flip this
    # newest-first ordering in the output, so append explicitly
    for post in sorted(posts, key=lambda x: x['date'], reverse=True):
        fe = fg.add_entry(order='append')
        fe.id(post['url'])
        fe.title(post['title'])
        fe.link(href=post['url'])
        fe.published(post['date'])
        fe.content(post['content'], type='xhtml')

    os.makedirs(FEED_OUTPUT_DIR, exist_ok=True)
    fg.rss_file(os.path.join(FEED_OUTPUT_DIR, 'feed.rss'))
    fg.atom_file(os.path.join(FEED_OUTPUT_DIR, 'feed.atom'))


def main():
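    """Collect every .xhtml post under POSTS_DIR and generate the feeds."""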
    posts = []
    for root, _dirs, files in os.walk(POSTS_DIR):
        for file in files:
            if file.endswith('.xhtml'):
                filepath = os.path.join(root, file)
                posts.append(parse_xhtml_post(filepath))

    generate_feeds(posts)
    print(f'✅ Generated {len(posts)} posts in RSS and Atom feeds.')


if __name__ == '__main__':
    main()