#!/usr/bin/env bash # shellcheck disable=SC2002,SC2001 # # extract MediaWiki pages, strip out CSS styling, create Markdown # # SPDX-License-Identifier: MIT # Two easy ways to get all page names - with and without Category used: # # Category: MediaWiki -> Special Pages -> 'Export' # - type in category + add, sort, remove "Category:xxx" items # or # No Category: MediaWiki -> Special Pages -> 'All Pages' # - copy the page names to a text file ## PANDOC is required at the end, install the latest package ## from https://github.com/jgm/pandoc/releases # MW site SITE="https://mediawiki.site/wiki" # file with page names, one per line PAGE="./pages.txt" # output directory for HTML, no trailing slash OUTD="." while IFS='' read -r page || [[ -n "$page" ]]; do echo "$page" _PG="${OUTD}/${page}.html" _MD="${OUTD}/${page}.md" # get the page rm -f "$_PG" rm -f "$_MD" curl -so "$_PG" "${SITE}/${page}?action=render" # convert code blocks to...code blocks (sigh) cat "$_PG" | sed -e :a -re 's/
(.*)?<\/pre>/\1<\/code><\/pre>/g;//N;//ba' > "${OUTD}/_xy.z" mv -f "${OUTD}/_xy.z" "$_PG" # strip out only (space)(class|id|style)=(.*) sed -i 's/\(<[^>]*\) \+class="[^"]*"\([^>]*>\)/\1\2/ig' "$_PG" sed -i 's/\(<[^>]*\) \+id="[^"]*"\([^>]*>\)/\1\2/ig' "$_PG" sed -i 's/\(<[^>]*\) \+style="[^"]*"\([^>]*>\)/\1\2/ig' "$_PG" # reference links have "rel=nofollow" in them sed -i 's/\(<[^>]*\) \+rel="[^"]*"\([^>]*>\)/\1\2/ig' "$_PG" # delete bare div/span open/close tags (stripped) sed -i 's/<[\/]*span>//ig' "$_PG" sed -i 's/<[\/]*div>//ig' "$_PG" # html comment at bottom injected by MW during render link above cat "$_PG" | sed -e :a -re 's///g;/