#!/usr/bin/env bash # shellcheck disable=SC2002,SC2001 # # extract MediaWiki pages, strip out CSS styling, create Markdown # # SPDX-License-Identifier: MIT # Two easy ways to get all page names - with and without Category used: # # Category: MediaWiki -> Special Pages -> 'Export' # - type in category + add, sort, remove "Category:xxx" items # or # No Category: MediaWiki -> Special Pages -> 'All Pages' # - copy the page names to a text file ## PANDOC is required at the end, install the latest package ## from https://github.com/jgm/pandoc/releases # MW site SITE="https://mediawiki.site/wiki" # file with page names, one per line PAGE="./pages.txt" # output directory for HTML, no trailing slash OUTD="." while IFS='' read -r page || [[ -n "$page" ]]; do echo "$page" _PG="${OUTD}/${page}.html" _MD="${OUTD}/${page}.md" # get the page rm -f "$_PG" rm -f "$_MD" curl -so "$_PG" "${SITE}/${page}?action=render" # convert code blocks to...code blocks (sigh) cat "$_PG" | sed -e :a -re 's/
(.*)?<\/pre>/
\1<\/code><\/pre>/g;/
/N;//ba' > "${OUTD}/_xy.z"
  mv -f "${OUTD}/_xy.z" "$_PG"
  # Strip presentation-only markup in place.  The four attribute
  # expressions (class, id, style, and the rel="nofollow" MediaWiki adds
  # to reference links) are line-local s/// commands, so running them as
  # one sed invocation with several -e expressions is byte-for-byte
  # equivalent to four separate whole-file passes -- just one pass over
  # the file instead of four.
  sed -i \
    -e 's/\(<[^>]*\) \+class="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+id="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+style="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+rel="[^"]*"\([^>]*>\)/\1\2/ig' \
    "$_PG"
  # With their attributes removed above, bare <span>/<div> wrappers carry
  # no information: delete the open/close tags (their contents are kept).
  sed -i \
    -e 's/<[\/]*span>//ig' \
    -e 's/<[\/]*div>//ig' \
    "$_PG"
  cat "$_PG" | sed -e :a -re 's///g;/