scripts/shell/mw2md.sh
2024-03-20 11:28:46 -05:00

53 lines
2 KiB
Bash
Executable file

#!/usr/bin/env bash
# shellcheck disable=SC2002,SC2001
#
# extract MediaWiki pages, strip out CSS styling, create Markdown
#
# SPDX-License-Identifier: MIT
# Two easy ways to get all page names - with and without Category used:
#
# Category: MediaWiki -> Special Pages -> 'Export'
# - type in category + add, sort, remove "Category:xxx" items
# or
# No Category: MediaWiki -> Special Pages -> 'All Pages'
# - copy the page names to a text file
## pandoc is required for the final HTML -> Markdown conversion; install the
## latest package from https://github.com/jgm/pandoc/releases
# MW site
SITE="https://mediawiki.site/wiki"
# file with page names, one per line
PAGE="./pages.txt"
# output directory for HTML, no trailing slash
OUTD="."

# a failing stage inside the clean-up pipeline must fail the whole pipeline
set -o pipefail

# warn MESSAGE...
# Print a diagnostic on stderr.
warn() {
  printf 'mw2md: %s\n' "$*" >&2
}

# span_filter OPEN CLOSE [NEW_OPEN NEW_CLOSE]
# Filter stdin to stdout. Every span that starts with the literal string OPEN
# and ends at the *nearest* following literal CLOSE (spans may cross lines) is
# - replaced by NEW_OPEN + inner text + NEW_CLOSE when both are given, or
# - deleted entirely when they are omitted.
# An OPEN without a matching CLOSE is left untouched.
# awk index()/substr() is used instead of a sed regex because POSIX ERE has no
# non-greedy quantifier: '.*?' is still greedy and would swallow everything
# between the first OPEN and the last CLOSE.
span_filter() {
  local keep=0
  if (( $# >= 4 )); then
    keep=1
  fi
  # LC_ALL=C: treat the input as bytes so odd encodings cannot upset awk
  LC_ALL=C awk -v otag="$1" -v ctag="$2" -v nopen="${3-}" -v nclose="${4-}" \
    -v keep="$keep" '
    { buf = buf $0 "\n" }
    END {
      out = ""
      while ((s = index(buf, otag)) > 0) {
        rest = substr(buf, s + length(otag))
        e = index(rest, ctag)
        if (e == 0)
          break                       # unterminated span: keep as is
        out = out substr(buf, 1, s - 1)
        if (keep)
          out = out nopen substr(rest, 1, e - 1) nclose
        buf = substr(rest, e + length(ctag))
      }
      printf "%s", out buf
    }'
}

# Filter: convert code blocks to...code blocks (sigh)
# Must run before strip_attrs, which would remove the style="..." marker.
code_blocks() {
  span_filter '<pre style="white-space: pre-wrap;">' '</pre>' \
    '<pre><code>' '</code></pre>'
}

# Filter: strip (space)(class|id|style|rel)="..." from tags, then delete the
# bare div/span open/close tags that are left behind. Reference links carry
# rel="nofollow", hence rel. One sed process applies all rules line by line.
strip_attrs() {
  sed \
    -e 's/\(<[^>]*\) \+class="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+id="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+style="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+rel="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/<[\/]*span>//ig' \
    -e 's/<[\/]*div>//ig'
}

# Filter: remove HTML comments (MW injects some at the bottom of the page
# during action=render).
strip_comments() {
  span_filter '<!--' '-->'
}

# Filter: the complete HTML clean-up, in the order the steps depend on.
clean_html() {
  code_blocks | strip_attrs | strip_comments
}

# convert_page PAGE_NAME
# Download one page into ${OUTD}/PAGE_NAME.html, clean the HTML in place and
# write ${OUTD}/PAGE_NAME.md as GitHub flavored Markdown with a "# Title"
# heading (underscores in the page name shown as spaces).
# Returns non-zero, after printing a warning, if any step fails.
convert_page() {
  local page=$1
  local html="${OUTD}/${page}.html"
  local md="${OUTD}/${page}.md"
  local tmp

  # never leave stale output from an earlier run behind
  rm -f -- "$html" "$md"

  # subpages ("Parent/Child") need their directory to exist
  if ! mkdir -p -- "$(dirname -- "$html")"; then
    warn "cannot create output directory for: ${page}"
    return 1
  fi

  # get the page; -f makes HTTP errors fail instead of saving the error page
  if ! curl -fsS -o "$html" "${SITE}/${page}?action=render"; then
    warn "download failed: ${page}"
    rm -f -- "$html"
    return 1
  fi

  if ! tmp=$(mktemp); then
    warn "mktemp failed"
    return 1
  fi

  if ! clean_html < "$html" > "$tmp"; then
    warn "HTML clean-up failed: ${page}"
    rm -f -- "$tmp"
    return 1
  fi
  # copy rather than mv so the .html keeps its normal (umask) permissions
  cat -- "$tmp" > "$html"

  # create Github flavored markdown
  if ! pandoc --from html --to gfm --output "$tmp" "$html"; then
    warn "pandoc failed: ${page}"
    rm -f -- "$tmp"
    return 1
  fi

  # prepend the title with printf: works for empty pandoc output and for
  # titles containing backslashes
  {
    printf '# %s\n\n' "${page//_/ }"
    cat -- "$tmp"
  } > "$md"
  rm -f -- "$tmp"
}

# main
# Convert every page named in ${PAGE}. Blank lines are skipped and a trailing
# carriage return (CRLF file) is removed. A failing page does not stop the
# run; the return status is 1 if any page failed or a prerequisite is missing.
main() {
  local page tool failed=0

  for tool in curl pandoc; do
    if ! command -v "$tool" > /dev/null; then
      warn "required tool not found: ${tool}"
      return 1
    fi
  done

  if [[ ! -r "$PAGE" ]]; then
    warn "cannot read page list: ${PAGE}"
    return 1
  fi

  # the list is read on fd 3 so nothing inside the loop can consume it
  while IFS= read -r -u 3 page || [[ -n "$page" ]]; do
    page=${page%$'\r'}
    [[ -n "$page" ]] || continue
    printf '%s\n' "$page"
    convert_page "$page" || failed=1
  done 3< "$PAGE"

  return "$failed"
}

main "$@"