-
Notifications
You must be signed in to change notification settings - Fork 408
Expand file tree
/
Copy pathnews2rss.py
More file actions
executable file
·150 lines (115 loc) · 4.85 KB
/
news2rss.py
File metadata and controls
executable file
·150 lines (115 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
# Converts the news page to an RSS feed
import os
import base64
import re
import argparse
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET
from urllib.parse import urljoin
def extract_canonical_url(soup):
    """Return the ``href`` of the page's ``<link rel="canonical">``.

    ``None`` is returned when the document has no such link or when the
    link carries no (or an empty) ``href`` attribute.
    """
    tag = soup.find("link", rel="canonical")
    if not tag:
        return None
    if not tag.get("href"):
        return None
    return tag["href"]
def make_urls_absolute(soup, base_url):
    """Resolve link and image URLs in *soup* against *base_url*.

    Every ``<a href>`` and ``<img src>`` value is passed through
    ``urljoin``; the tree is modified in place and the same object is
    returned for convenience.
    """
    # (tag name, URL-carrying attribute) pairs, handled in this order.
    url_attributes = (("a", "href"), ("img", "src"))
    for tag_name, attribute in url_attributes:
        for element in soup.find_all(tag_name, **{attribute: True}):
            element[attribute] = urljoin(base_url, element[attribute])
    return soup
def html_to_rss(html_content, feed_title, feed_description):
    """Convert the news HTML page into an RSS 2.0 document string.

    Each ``<h3>`` whose ``id`` starts with a valid ``YYYY-MM-DD`` date becomes
    one feed item. The item body is the serialized markup of every sibling
    node that follows the heading, up to (not including) the next sibling
    ``<h3>``. Headings without a usable date are reported and skipped.

    Args:
        html_content: Full HTML source of the news page. It must contain a
            ``<link rel="canonical">`` element; its ``href`` is used as the
            channel link and as the base for resolving relative URLs.
        feed_title: Text for the channel ``<title>``.
        feed_description: Text for the channel ``<description>``.

    Returns:
        The feed as a string: an XML declaration, an ``xml-stylesheet``
        processing instruction embedding ``news2rss.xsl`` as a base64 data
        URI, and the serialized ``<rss>`` element.

    Raises:
        ValueError: If the page has no canonical URL.
        OSError: If ``news2rss.xsl`` next to this script cannot be read.
    """
    # Imported here because the module header only brings in ``datetime``.
    from datetime import timezone
    from email.utils import format_datetime

    soup = BeautifulSoup(html_content, "html.parser")

    # The canonical link doubles as channel link and URL base.
    page_url = extract_canonical_url(soup)
    if not page_url:
        raise ValueError("No canonical URL found in the HTML content")
    print(f"Found canonical URL: {page_url}")

    # Feed readers render item bodies out of context, so relative URLs
    # would break there.
    soup = make_urls_absolute(soup, page_url)

    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")

    # Channel metadata
    ET.SubElement(channel, "title").text = feed_title
    ET.SubElement(channel, "link").text = page_url
    ET.SubElement(channel, "description").text = feed_description
    ET.SubElement(channel, "language").text = "en-us"
    # Use an aware UTC timestamp so the "+0000" offset is truthful, and
    # format_datetime() so day/month names are English regardless of locale.
    ET.SubElement(channel, "lastBuildDate").text = format_datetime(
        datetime.now(timezone.utc)
    )

    id_date_pattern = re.compile(r"(\d{4}-\d{2}-\d{2})")
    title_prefix_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}\s*-\s*")

    # Every h3 carrying an id is a candidate news item.
    for h3 in soup.find_all("h3", id=True):
        h3_id = h3.get("id", "")

        date_match = id_date_pattern.match(h3_id)
        if not date_match:
            print(f"Skipping h3 with id '{h3_id}' - no valid date found")
            continue
        try:
            # Item dates have no time of day; publish them as midnight UTC.
            published = datetime.strptime(
                date_match.group(1), "%Y-%m-%d"
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            # Digits matched the pattern but do not form a real calendar date.
            print(f"Skipping h3 with id '{h3_id}' - no valid date found")
            continue

        # Heading text without the leading "YYYY-MM-DD - " prefix.
        title_clean = title_prefix_pattern.sub("", h3.get_text().strip())

        # Gather the markup between this heading and the next sibling h3.
        content_parts = []
        for sibling in h3.next_siblings:
            if sibling.name == "h3":
                break
            content_parts.append(str(sibling))
        content = "".join(content_parts)

        item = ET.SubElement(channel, "item")
        ET.SubElement(item, "title").text = title_clean

        # Link and guid point at the heading's anchor on the news page.
        item_url = f"{page_url}#{h3_id}"
        ET.SubElement(item, "link").text = item_url
        ET.SubElement(item, "guid").text = item_url

        ET.SubElement(item, "pubDate").text = format_datetime(published)

        # The HTML is stored as element text; ElementTree escapes it on
        # serialization (it is not emitted as a CDATA section).
        ET.SubElement(item, "description").text = content

    # Embed the XSLT stylesheet as a data URI so the feed is self-contained.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    xsl_file = os.path.join(script_dir, "news2rss.xsl")
    with open(xsl_file, 'r', encoding='utf-8') as f:
        xsl_content = f.read()
    xsl_b64 = base64.b64encode(xsl_content.encode('utf-8')).decode('ascii')
    data_uri = f"data:text/xsl;base64,{xsl_b64}"

    xml_str = ET.tostring(rss, encoding="unicode")
    return "".join(
        (
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            f'<?xml-stylesheet type="text/xsl" href="{data_uri}"?>\n',
            xml_str,
        )
    )
def main():
    """Command-line entry point: read an HTML file and write its RSS feed.

    Parses ``--file``/``--output`` (both required) plus optional ``--title``
    and ``--description``, converts the HTML with ``html_to_rss`` and writes
    the result to the output path, printing progress along the way.
    """
    cli = argparse.ArgumentParser(description="Convert HTML page to RSS feed")
    cli.add_argument(
        "-f", "--file", required=True, help="Path to the HTML file to convert"
    )
    cli.add_argument("-o", "--output", required=True, help="Output RSS file path")
    cli.add_argument("--title", default="MSYS2 News", help="RSS feed title")
    cli.add_argument(
        "--description",
        default="MSYS2 project news and updates",
        help="RSS feed description",
    )
    options = cli.parse_args()
    source_path = options.file
    target_path = options.output

    print(f"Reading HTML from file {source_path}...")
    with open(source_path, "r", encoding="utf-8") as html_file:
        page_markup = html_file.read()

    print("Converting to RSS feed...")
    feed_xml = html_to_rss(page_markup, options.title, options.description)

    with open(target_path, "w", encoding="utf-8") as feed_file:
        feed_file.write(feed_xml)
    print(f"RSS feed saved to {target_path}")


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()