adding mw2md
This commit is contained in:
parent
357a84a32c
commit
9c0c2387c1
1 changed files with 53 additions and 0 deletions
53
shell/mw2md.sh
Executable file
53
shell/mw2md.sh
Executable file
|
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/env bash
|
||||
# shellcheck disable=SC2002,SC2001
|
||||
#
|
||||
# extract MediaWiki pages, strip out CSS styling, create Markdown
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Two easy ways to get all page names - with and without Category used:
|
||||
#
|
||||
# Category: MediaWiki -> Special Pages -> 'Export'
|
||||
# - type in category + add, sort, remove "Category:xxx" items
|
||||
# or
|
||||
# No Category: MediaWiki -> Special Pages -> 'All Pages'
|
||||
# - copy the page names to a text file
|
||||
|
||||
## PANDOC is required at the end, install the latest package
|
||||
## from https://github.com/jgm/pandoc/releases
|
||||
|
||||
# ---------------------------------------------------------------------------
# Configuration. Each value can be overridden from the environment without
# editing this file, e.g.:
#   SITE=https://wiki.example.org/wiki PAGE=./list.txt OUTD=./out ./mw2md.sh
# ---------------------------------------------------------------------------

# MW site: base URL; "/<page name>?action=render" is appended to it
SITE="${SITE:-https://mediawiki.site/wiki}"
# file with page names, one per line
PAGE="${PAGE:-./pages.txt}"
# output directory for HTML (and the generated Markdown), no trailing slash
OUTD="${OUTD:-.}"
|
||||
|
||||
# ---------------------------------------------------------------------------
# Page conversion: fetch each page listed in $PAGE from $SITE as rendered
# HTML, strip MediaWiki styling noise, and convert the result to GitHub
# flavored Markdown in $OUTD. Needs curl, pandoc, awk and GNU sed.
# ---------------------------------------------------------------------------

#######################################
# Rewrite or delete every OPEN...CLOSE span in a text stream. Each OPEN is
# paired with the FIRST following CLOSE (shortest match), and a span may
# cover several lines. A span that is never closed is passed through
# unchanged.
# Arguments:
#   $1 - literal opening delimiter
#   $2 - literal closing delimiter
#   $3 - replacement for the opening delimiter
#   $4 - replacement for the closing delimiter
#   $5 - 1 to keep the span body between the replacements, 0 to drop the
#        whole span (delimiters and body)
# Inputs:  text on stdin
# Outputs: rewritten text on stdout
#######################################
_mw2md_rewrite_spans() {
  awk -v otag="$1" -v ctag="$2" -v new_open="$3" -v new_close="$4" \
      -v keep="$5" '
    BEGIN { ORS = ""; inside = 0; held = "" }
    {
      rest = $0
      out = ""
      while (1) {
        if (!inside) {
          pos = index(rest, otag)
          if (pos == 0) { out = out rest; break }
          out = out substr(rest, 1, pos - 1)
          rest = substr(rest, pos + length(otag))
          held = ""
          inside = 1
        } else {
          pos = index(rest, ctag)
          if (pos == 0) { held = held rest "\n"; break }
          if (keep) {
            out = out new_open held substr(rest, 1, pos - 1) new_close
          }
          rest = substr(rest, pos + length(ctag))
          inside = 0
        }
      }
      # A line that ends inside a span gets no newline here: that newline
      # is part of the held span body and is emitted (or dropped) with it.
      if (inside) { print out } else { print out "\n" }
    }
    # Unterminated span: emit it exactly as it came in.
    END { if (inside) { print otag held } }
  '
}

#######################################
# Clean MediaWiki "action=render" HTML so pandoc produces tidy Markdown.
# Steps, in order:
#   1. pre-wrap <pre> blocks become <pre><code>...</code></pre>
#   2. class/id/style/rel attributes are removed from tags
#   3. bare <span>, </span>, <div>, </div> tags are removed
#   4. HTML comments are removed
# Inputs:  HTML on stdin
# Outputs: cleaned HTML on stdout
#######################################
_mw2md_clean_html() {
  _mw2md_rewrite_spans '<pre style="white-space: pre-wrap;">' '</pre>' \
      '<pre><code>' '</code></pre>' 1 \
    | sed \
        -e 's/\(<[^>]*\) \+class="[^"]*"\([^>]*>\)/\1\2/ig' \
        -e 's/\(<[^>]*\) \+id="[^"]*"\([^>]*>\)/\1\2/ig' \
        -e 's/\(<[^>]*\) \+style="[^"]*"\([^>]*>\)/\1\2/ig' \
        -e 's/\(<[^>]*\) \+rel="[^"]*"\([^>]*>\)/\1\2/ig' \
        -e 's/<[\/]*span>//ig' \
        -e 's/<[\/]*div>//ig' \
    | _mw2md_rewrite_spans '<!--' '-->' '' '' 0
}

#######################################
# Fetch one page, write the cleaned HTML and the Markdown conversion.
# Globals:   SITE (read), OUTD (read)
# Arguments: $1 - page name as listed in the page file
# Outputs:   page name on stdout; "$OUTD/<page>.html" and "$OUTD/<page>.md"
#            on disk; diagnostics on stderr
# Returns:   0 on success, 1 if the download or conversion failed
#######################################
_mw2md_convert_page() {
  local page=$1
  local html="${OUTD}/${page}.html"
  local md="${OUTD}/${page}.md"
  # MediaWiki treats spaces and underscores in titles as equivalent; a raw
  # space is not valid in a URL. Other reserved characters (?, &, #, %) are
  # NOT encoded here.
  local url="${SITE}/${page// /_}?action=render"
  local title=${page//_/ }
  local -a rc

  printf '%s\n' "$page"

  # Sub-pages ("Parent/Child") need their directory to exist.
  mkdir -p -- "${html%/*}" || return 1
  rm -f -- "$html" "$md"

  # -f: treat HTTP errors as failures instead of saving the error page;
  # -L: follow redirects; -sS: quiet, but still report errors.
  curl -fsSL "$url" | _mw2md_clean_html > "$html"
  rc=("${PIPESTATUS[@]}")
  if (( rc[0] != 0 || rc[1] != 0 )); then
    printf 'mw2md: could not fetch %s (curl exit status %d)\n' \
      "$url" "${rc[0]}" >&2
    rm -f -- "$html"
    return 1
  fi

  # create Github flavored markdown, with the page title as a level-1
  # heading followed by a blank line
  if ! { printf '# %s\n\n' "$title"
         pandoc --from html --to gfm "$html"; } > "$md"; then
    printf 'mw2md: pandoc failed for %s\n' "$html" >&2
    rm -f -- "$md"
    return 1
  fi
}

#######################################
# Convert every page named in $PAGE; a failing page does not stop the run.
# Globals:   PAGE (read)
# Returns:   0 if every page converted, 1 otherwise
#######################################
_mw2md_main() {
  local page tool failed=0

  if [[ ! -r "${PAGE:-}" ]]; then
    printf 'mw2md: cannot read page list: %s\n' "${PAGE:-}" >&2
    return 1
  fi
  for tool in curl pandoc awk sed; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      printf 'mw2md: required tool not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  # The list is read on fd 3 so that nothing run inside the loop can
  # consume it through stdin.
  while IFS= read -r -u 3 page || [[ -n "$page" ]]; do
    page=${page%$'\r'}             # tolerate CRLF line endings
    [[ -n "$page" ]] || continue   # skip blank lines
    _mw2md_convert_page "$page" || failed=1
  done 3< "${PAGE}"

  return "$failed"
}

_mw2md_main
|
||||
Loading…
Add table
Add a link
Reference in a new issue