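"""Generate RSS and Atom feeds from a directory of XHTML blog posts.

Walks POSTS_DIR for *.xhtml files, extracts each post's title, date, and
content, and writes feed.rss and feed.atom into FEED_OUTPUT_DIR. Run it
from the site root so the relative paths resolve.
"""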
import os
import datetime

from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from pytz import UTC  # Ensures timezone-aware datetimes

# Configuration
BASE_URL = 'https://purplebored.pl'
POSTS_DIR = './blog/posts'
FEED_OUTPUT_DIR = './feeds'
AUTHOR_NAME = 'Purplebored'
AUTHOR_EMAIL = 'purplebored@posteo.com'


def parse_xhtml_post(filepath):
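    """Parse one XHTML post and return its title, URL, date, and content."""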
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    # Prefer <meta name="og:title" content="...">, fall back to <title>
    og_title = soup.find('meta', attrs={'name': 'og:title'})
    if og_title and og_title.get('content'):
        title = og_title['content']
    else:
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else 'Untitled'

    # Parse <meta name="date" content="YYYY-MM-DD">, falling back to the
    # file's modification time when the meta tag is missing
    date_meta = soup.find('meta', {'name': 'date'})
    if date_meta and date_meta.get('content'):
        pub_date = datetime.datetime.strptime(date_meta['content'], '%Y-%m-%d')
    else:
        pub_date = datetime.datetime.fromtimestamp(os.path.getmtime(filepath))

    pub_date = pub_date.replace(tzinfo=UTC)

    # Extract the <article> element, or fall back to <body>
    content = soup.find('article') or soup.find('body')

    # Build the post's absolute URL from its path relative to POSTS_DIR
    rel_path = os.path.relpath(filepath, POSTS_DIR)
    url = f'{BASE_URL}/blog/posts/{rel_path}'.replace('\\', '/')

    return {
        'title': title,
        'url': url,
        'date': pub_date,
        'content': str(content) if content else '',  # avoid str(None)
    }


def generate_feeds(posts):
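    """Build the feed from parsed posts and write RSS and Atom files."""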
    fg = FeedGenerator()
    fg.id(BASE_URL)
    fg.title('Purplebored Blog')
    fg.author({'name': AUTHOR_NAME, 'email': AUTHOR_EMAIL})
    fg.link(href=BASE_URL, rel='alternate')
    fg.language('en')
    fg.description('A blog about snuff reviews and other thoughts.')

    # feedgen prepends new entries by default, which would flip this
    # newest-first ordering in the output, so append explicitly
    for post in sorted(posts, key=lambda x: x['date'], reverse=True):
        fe = fg.add_entry(order='append')
        fe.id(post['url'])
        fe.title(post['title'])
        fe.link(href=post['url'])
        fe.published(post['date'])
        fe.content(post['content'], type='xhtml')

    os.makedirs(FEED_OUTPUT_DIR, exist_ok=True)
    fg.rss_file(os.path.join(FEED_OUTPUT_DIR, 'feed.rss'))
    fg.atom_file(os.path.join(FEED_OUTPUT_DIR, 'feed.atom'))


def main():
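    """Collect every .xhtml post under POSTS_DIR and generate the feeds."""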
    posts = []
    for root, _dirs, files in os.walk(POSTS_DIR):
        for file in files:
            if file.endswith('.xhtml'):
                filepath = os.path.join(root, file)
                posts.append(parse_xhtml_post(filepath))

    generate_feeds(posts)
    print(f'✅ Generated {len(posts)} posts in RSS and Atom feeds.')


if __name__ == '__main__':
    main()