-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblogger-to-md.py
137 lines (123 loc) · 4.67 KB
/
blogger-to-md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
How to use this script:
1. Go to your blogger account settings
2. Search for the "Back up content" link
3. Download the content as an XML file
4. Run the script with:
Usage: python blogger-to-md.py [OPTIONS] INPUT_FILE OUTPUT_DIR
Arguments:
INPUT_FILE [required]
OUTPUT_DIR [required]
Options:
--tag TEXT
Tag to add to frontmatter
[default: legacy-blogger]
--show-original / --no-show-original
Link MD files to original articles
[default: show-original]
TODOs
1. Remove the odd 'pydanny' specific items
2. Add pure python way to convert HTML to markdown
"""
import sys
from pathlib import Path
try:
    import feedparser
    import typer
    import yaml
except ImportError as exc:
    # The `yaml` module is published on PyPI as "pyyaml". Stop here rather
    # than carrying on: the definitions below use these modules at import
    # time, so continuing would only fail later with a less helpful NameError.
    sys.exit(
        f"Missing dependency {exc.name!r}. "
        "Run 'pip install feedparser typer pyyaml'"
    )
# Blogger tags every post with this "kind" category term. It is bookkeeping,
# not a real tag. Matched by suffix so both the http:// and https:// spellings
# of the term are filtered out.
_POST_KIND_SUFFIX = "://schemas.google.com/blogger/2008/kind#post"


def _scheme_variants(url):
    """Return *url* spelled with both the http:// and https:// schemes.

    A URL that starts with neither scheme is returned unchanged as the only
    variant.
    """
    for scheme in ("https://", "http://"):
        if url.startswith(scheme):
            rest = url[len(scheme):]
            return ("http://" + rest, "https://" + rest)
    return (url,)


def _markdown_filename(post_url, blog_prefixes):
    """Turn a post URL into a flat ``.md`` filename.

    Every string in *blog_prefixes* is removed from the URL, a leading "/" is
    dropped, and the remaining "/" separators become "-". The result is empty
    when the URL is nothing but the blog prefix.
    """
    name = post_url.replace(".html", ".md")
    for prefix in blog_prefixes:
        name = name.replace(prefix, "")
    return name.lstrip("/").replace("/", "-")


def _collect_posts(entries):
    """Map post URL -> post entry, each with its comments under ``"comments"``.

    Configuration/template entries are dropped, as is any entry that lacks a
    field the filters below need.
    """
    posts = {}
    pending_comments = []
    for entry in entries:
        try:
            # Filter out config data and other junk
            if "tag:blogger.com" in entry.link:
                continue
            if "comments" in entry["href"]:
                continue
            if "#settings" in entry.category:
                continue
            if entry.title == "Template: pydanny":
                continue
            if "#comment" in entry.category:
                # Attach after the loop so a comment that precedes its post
                # in the feed is not lost.
                pending_comments.append((entry["thr_in-reply-to"]["href"], entry))
                continue
        except (KeyError, AttributeError):
            # Item access on a missing field raises KeyError, attribute
            # access raises AttributeError; either way the entry is unusable.
            continue
        # Add entries to the posts and prep for comments
        entry["comments"] = []
        posts[entry.link] = entry

    for post_url, comment in pending_comments:
        if post_url in posts:
            posts[post_url]["comments"].append(comment)
    return posts


def main(
    input_file: Path,
    output_dir: Path,
    tag: str = typer.Option("legacy-blogger", help="Tag to add to frontmatter"),
    show_original: bool = typer.Option(True, help="Link MD files to original articles"),
):
    """Convert a Blogger XML backup into one Markdown file per post.

    INPUT_FILE is the XML export downloaded from Blogger. OUTPUT_DIR is the
    directory the Markdown files are written to; it is created if missing.
    Each file gets YAML frontmatter, the post's original HTML body and any
    comments captured in the backup.
    """
    # NOTE: Typer shows the docstring above as the command's --help text.
    typer.secho(f"Parsing data from '{input_file}'", fg=typer.colors.GREEN)
    # Explicit encoding so the result does not depend on the platform locale.
    raw_text = input_file.read_text(encoding="utf-8")

    # parse the historical data
    data = feedparser.parse(raw_text)
    posts = _collect_posts(data.entries)

    # Post URLs may use either scheme regardless of which one the feed link
    # uses, so strip both spellings of the blog prefix.
    blog_prefixes = _scheme_variants(data["feed"].get("link", ""))
    output_dir.mkdir(parents=True, exist_ok=True)

    # Write the markdown files
    typer.secho(
        f"Writing {len(posts)} blogger posts to markdown files", fg=typer.colors.GREEN
    )
    with typer.progressbar(posts.items()) as posts_progress:
        for url, post in posts_progress:
            # Get a MD filename from the original HTML URL
            filename = _markdown_filename(url, blog_prefixes)

            # Catches some of the configuration elements
            if not filename.strip():
                continue

            # bypasses simple pages, TODO: Provide option to create MD pages
            # (checked after "/" became "-", so "p/about.md" is seen as "p-about.md")
            if filename.startswith("p-"):
                continue

            # Get a list of tags, minus Blogger's "kind" marker
            terms = (item["term"] for item in post.tags)
            tags = [
                term
                for term in terms
                if not (term or "").endswith(_POST_KIND_SUFFIX)
            ]
            # Add the tag option to list of tags
            tags.append(tag)

            frontmatter = {
                "date": post["published"],
                "published": True,
                "slug": filename.replace(".md", ""),
                "tags": tags,
                "time_to_read": 5,
                "title": post["title"],
                "description": "",
            }

            with (output_dir / filename).open("w", encoding="utf-8") as f:
                # Set the frontmatter
                f.write("---\n")
                f.write(yaml.dump(frontmatter))
                f.write("---\n\n")

                if show_original:
                    # Set a link to the original content
                    f.write(
                        f"*This was originally posted on blogger [here]({url})*.\n\n"
                    )

                # Write the HTML, TODO: consider converting to markdown
                f.write(post["summary"])

                # If any comments, add them
                comments = post["comments"]
                if comments:
                    f.write("\n\n---\n\n")
                    f.write(
                        f"## {len(comments)} comments captured from [original post]({url}) on Blogger\n\n"
                    )
                    for comment in comments:
                        f.write(
                            f"**{comment['author_detail']['name']} said on {comment['published'][:10]}**\n\n"
                        )
                        f.write(comment["summary"])
                        f.write("\n\n")
# Script entry point: when the file is executed directly, let Typer run
# `main` as the command.
if __name__ == "__main__":
    typer.run(main)