#!/usr/bin/env bash
# shellcheck disable=SC2002,SC2001
#
# extract MediaWiki pages, strip out CSS styling, create Markdown
#
# SPDX-License-Identifier: MIT

# Two easy ways to get all page names - with and without Category used:
#
# Category: MediaWiki -> Special Pages -> 'Export'
# - type in category + add, sort, remove "Category:xxx" items
#  or
# No Category: MediaWiki -> Special Pages -> 'All Pages'
# - copy the page names to a text file

## PANDOC is required at the end, install the latest package
## from https://github.com/jgm/pandoc/releases

# Settings. Each one keeps its built-in default unless a non-empty variable
# of the same name is already present in the environment, e.g.
#   SITE="https://wiki.example.org/wiki" OUTD="./out" ./this-script
#
# MW site: base URL that "/<page name>?action=render" is appended to
SITE="${SITE:-https://mediawiki.site/wiki}"
# file with page names, one per line
PAGE="${PAGE:-./pages.txt}"
# output directory for the HTML and Markdown files, no trailing slash
OUTD="${OUTD:-.}"

# Make a pipeline fail when any of its stages fails, so that a failed
# download is not hidden by the clean-up filters that run after it.
set -o pipefail

#######################################
# Rewrite delimited spans in the text read from stdin.
# A span runs from an opening delimiter to the NEAREST closing delimiter
# after it and may cover several lines. A span that is never closed is
# left untouched. Delimiters are matched literally and case-sensitively.
# Arguments:
#   $1 - opening delimiter (literal text, must not contain backslashes)
#   $2 - closing delimiter (literal text, must not contain backslashes)
#   $3 - text that replaces the opening delimiter
#   $4 - text that replaces the closing delimiter
#   $5 - "keep" to retain the text between the delimiters; any other
#        value discards it
# Outputs: the rewritten text on stdout
#######################################
rewrite_spans() {
  # LC_ALL=C makes index/substr count bytes; the delimiters are ASCII, so
  # the cut points never fall inside a multi-byte character.
  LC_ALL=C awk -v otag="$1" -v ctag="$2" -v orep="$3" -v crep="$4" \
    -v mode="$5" '
    # Collect the whole input so that a span can cross line boundaries.
    { buf = buf $0 "\n" }
    END {
      out = ""
      while ((s = index(buf, otag)) > 0) {
        rest = substr(buf, s + length(otag))
        e = index(rest, ctag)
        if (e == 0) {
          break   # unterminated span: emit the remainder unchanged
        }
        inner = (mode == "keep") ? substr(rest, 1, e - 1) : ""
        out = out substr(buf, 1, s - 1) orep inner crep
        buf = substr(rest, e + length(ctag))
      }
      printf "%s%s", out, buf
    }
  '
}

#######################################
# Convert code blocks to...code blocks: wrap the content of every
# <pre style="white-space: pre-wrap;"> element in <pre><code>.
# Reads HTML on stdin, writes HTML on stdout.
#######################################
rewrap_pre() {
  rewrite_spans '<pre style="white-space: pre-wrap;">' '</pre>' \
    '<pre><code>' '</code></pre>' keep
}

#######################################
# Strip only (space)(class|id|style|rel)="..." from inside tags, then
# delete the bare <span>/<div> open and close tags that stripping leaves
# behind. Reference links carry rel="nofollow", hence "rel".
# Uses GNU sed extensions (\+ and the case-insensitive "i" flag).
# Reads HTML on stdin, writes HTML on stdout.
#######################################
strip_attributes() {
  sed \
    -e 's/\(<[^>]*\) \+class="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+id="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+style="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/\(<[^>]*\) \+rel="[^"]*"\([^>]*>\)/\1\2/ig' \
    -e 's/<[\/]*span>//ig' \
    -e 's/<[\/]*div>//ig'
}

#######################################
# Remove HTML comments (MW injects some at the bottom of a rendered page).
# Each comment ends at its own "-->", so text lying between two comments
# on the same line is preserved.
# Reads HTML on stdin, writes HTML on stdout.
#######################################
strip_comments() {
  rewrite_spans '<!--' '-->' '' '' drop
}

#######################################
# Full HTML clean-up, in the order the steps depend on each other: the
# <pre> rewrite must see the style attribute before it is stripped.
# Reads HTML on stdin, writes HTML on stdout.
#######################################
clean_html() {
  rewrap_pre | strip_attributes | strip_comments
}

#######################################
# Download one page, clean its HTML and create GitHub flavored Markdown
# with the page title (underscores shown as spaces) as a level-1 heading.
# Globals:   SITE (read), OUTD (read)
# Arguments: $1 - page name as it appears in the URL
# Outputs:   writes ${OUTD}/<page>.html and ${OUTD}/<page>.md;
#            diagnostics go to stderr
# Returns:   0 on success, non-zero if any step failed
#######################################
convert_page() {
  local page=$1
  local html="${OUTD}/${page}.html"
  local md="${OUTD}/${page}.md"
  local url="${SITE}/${page}?action=render"
  local title=${page//_/ }

  # Page names may contain "/" (subpages); make sure the directory exists.
  if ! mkdir -p -- "${html%/*}"; then
    printf 'error: cannot create directory for page: %s\n' "$page" >&2
    return 1
  fi
  rm -f -- "$html" "$md"

  # get the page and clean it in one pass (no intermediate temp file)
  if ! curl --fail --silent --show-error --location "$url" \
    | clean_html >"$html"; then
    printf 'error: cannot fetch or clean page: %s\n' "$page" >&2
    rm -f -- "$html"
    return 1
  fi

  # create Github flavored markdown, heading first
  if ! {
    printf '# %s\n\n' "$title"
    pandoc --from html --to gfm "$html"
  } >"$md"; then
    printf 'error: pandoc failed for page: %s\n' "$page" >&2
    rm -f -- "$md"
    return 1
  fi
}

#######################################
# Convert every page named in the page list.
# Globals:   PAGE (read); SITE and OUTD are read by convert_page
# Outputs:   each page name on stdout as it is processed
# Returns:   0 if every page was converted, 1 otherwise
#######################################
main() {
  local page tool failed=0

  for tool in curl pandoc; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf 'error: required command not found: %s\n' "$tool" >&2
      return 1
    fi
  done
  if [[ ! -r "${PAGE}" ]]; then
    printf 'error: cannot read page list: %s\n' "${PAGE}" >&2
    return 1
  fi

  # The list is read on fd 3 so that commands run inside the loop cannot
  # consume page names from stdin.
  while IFS='' read -r -u 3 page || [[ -n "$page" ]]; do
    page=${page%$'\r'}            # tolerate CRLF line endings
    [[ -n "$page" ]] || continue  # skip blank lines
    printf '%s\n' "$page"
    convert_page "$page" || failed=1
  done 3<"${PAGE}"

  return "$failed"
}

main