53 lines
2 KiB
Bash
Executable file
53 lines
2 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# extract MediaWiki pages, strip out CSS styling, create Markdown
#
# SPDX-License-Identifier: MIT
#
# Two easy ways to get all page names - with and without Category used:
#
#   Category:    MediaWiki -> Special Pages -> 'Export'
#                - type in category + add, sort, remove "Category:xxx" items
#   or
#   No Category: MediaWiki -> Special Pages -> 'All Pages'
#                - copy the page names to a text file
#
# PANDOC is required at the end, install the latest package
# from https://github.com/jgm/pandoc/releases
# curl, awk and a sed that supports -E and the "I" flag (e.g. GNU sed)
# are needed as well.
#
# For every page name listed in $PAGE this writes
#   $OUTD/<page>.html  - the rendered page with styling stripped
#   $OUTD/<page>.md    - GitHub flavored Markdown with a "# title" heading
#
# SITE, PAGE and OUTD may be overridden from the environment.

# No 'set -e' on purpose: a page that fails to download or convert is
# reported and skipped so that the rest of the batch still runs.
set -u -o pipefail

# MW site
SITE="${SITE:-https://mediawiki.site/wiki}"
# file with page names, one per line
PAGE="${PAGE:-./pages.txt}"
# output directory for HTML, no trailing slash
OUTD="${OUTD:-.}"

# per-run scratch directory, created in main() and removed on exit
WORK_DIR=""

#######################################
# Rewrite every OPEN...CLOSE span read from stdin; the result goes to stdout.
# Each OPEN is paired with the FIRST CLOSE that follows it (sed cannot do
# this: it has no non-greedy '.*?'), and a span may cross line boundaries.
# A span whose CLOSE never appears is left untouched.
# Arguments:
#   $1 - literal opening delimiter
#   $2 - literal closing delimiter
#   $3 - replacement for the opening delimiter (optional)
#   $4 - replacement for the closing delimiter (optional)
#   With only two arguments the whole span, content included, is deleted.
#   Delimiters are passed with 'awk -v' and must not contain backslashes.
#######################################
rewrite_delimited() {
  local keep=0
  if (( $# > 2 )); then
    keep=1
  fi
  awk -v open_tag="$1" -v close_tag="$2" \
      -v new_open="${3-}" -v new_close="${4-}" -v keep="$keep" '
    { buf = buf $0 "\n" }
    END {
      while ((s = index(buf, open_tag)) > 0) {
        rest = substr(buf, s + length(open_tag))
        e = index(rest, close_tag)
        if (e == 0) break   # unterminated span: keep the remainder as is
        out = out substr(buf, 1, s - 1)
        if (keep) out = out new_open substr(rest, 1, e - 1) new_close
        buf = substr(rest, e + length(close_tag))
      }
      printf "%s", out buf
    }'
}

#######################################
# Clean rendered MediaWiki HTML: stdin -> stdout.
#   1. convert styled <pre> blocks to <pre><code> so they become code blocks
#   2. strip (space)(class|id|style|rel)="..." from tags; reference links
#      have rel="nofollow" in them
#   3. delete the bare div/span open/close tags left behind by step 2
#   4. delete HTML comments (MW injects one at the bottom of the render)
# Step 1 must run before step 2 because it matches on the style attribute.
#######################################
clean_html() {
  rewrite_delimited '<pre style="white-space: pre-wrap;">' '</pre>' \
                    '<pre><code>' '</code></pre>' \
    | sed -E \
        -e ':attr' \
        -e 's/(<[^>]*) +(class|id|style|rel)="[^"]*"([^>]*>)/\1\3/I' \
        -e 't attr' \
        -e 's/<\/*(span|div)>//Ig' \
    | rewrite_delimited '<!--' '-->'
}

#######################################
# Turn a page name into the path component of its URL: spaces become
# underscores (MediaWiki convention) and every byte outside
# [A-Za-z0-9._~/:-] is percent-encoded.
# Arguments: $1 - page name
# Outputs:   encoded name on stdout, without a trailing newline
#######################################
url_encode_title() {
  local LC_ALL=C   # index the string byte by byte
  local title=${1// /_}
  local i ch
  for ((i = 0; i < ${#title}; i++)); do
    ch=${title:i:1}
    case "$ch" in
      [a-zA-Z0-9._~/:-]) printf '%s' "$ch" ;;
      *) printf '%%%02X' "'$ch" ;;
    esac
  done
}

#######################################
# Download one page, clean it and convert it to Markdown.
# Globals:   SITE, OUTD, WORK_DIR (read)
# Arguments: $1 - page name exactly as listed in the page file
# Returns:   0 on success, non-zero if any step failed
#######################################
convert_page() {
  local page=$1
  local html="${OUTD}/${page}.html"
  local md="${OUTD}/${page}.md"
  local raw="${WORK_DIR}/raw.html"
  local body="${WORK_DIR}/body.md"
  local url

  url="${SITE}/$(url_encode_title "$page")?action=render"

  # sub-pages ("Foo/Bar") need their directory to exist
  mkdir -p -- "${html%/*}" || return

  # never leave stale output behind from an earlier run
  rm -f -- "$html" "$md"

  # get the page; -f makes HTTP errors fail instead of saving an error page
  curl -fsSL -o "$raw" "$url" || return

  clean_html < "$raw" > "$html" || return

  # create Github flavored markdown
  pandoc --from html --to gfm --output "$body" "$html" || return

  # heading first, then a blank line, then the converted body
  { printf '# %s\n\n' "${page//_/ }"; cat -- "$body"; } > "$md"
}

#######################################
# Entry point: convert every page listed in $PAGE.
# Globals: PAGE (read), WORK_DIR (written)
# Returns: 0 if every page converted, 1 otherwise
#######################################
main() {
  local page tool
  local failures=0

  if [[ ! -r "$PAGE" ]]; then
    printf 'error: cannot read page list: %s\n' "$PAGE" >&2
    return 1
  fi

  for tool in curl pandoc sed awk; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      printf 'error: required tool not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  if ! WORK_DIR=$(mktemp -d); then
    printf 'error: cannot create a temporary directory\n' >&2
    return 1
  fi
  trap 'rm -rf -- "$WORK_DIR"' EXIT

  # The list is read on fd 3 so nothing in the loop body can eat it from
  # stdin; the '|| [[ -n ... ]]' keeps a last line without a newline.
  while IFS= read -r -u 3 page || [[ -n "$page" ]]; do
    page=${page%$'\r'}   # tolerate CRLF page lists
    [[ -n "$page" ]] || continue
    printf '%s\n' "$page"
    if ! convert_page "$page"; then
      printf 'warning: failed to convert page: %s\n' "$page" >&2
      failures=$((failures + 1))
    fi
  done 3< "$PAGE"

  (( failures == 0 ))
}

main "$@"