#!/bin/bash

# Author: Gwern Branwen
# Date: 2016-10-01
# When: Time-stamp: "2024-11-22 09:12:05 gwern"
# License: CC-0
#
# sync-gwern.net.sh: shell script which automates a full build and sync of Gwern.net. A full build is
# intricate, and requires several passes, like generating link-bibliographies/tag-directories, running
# two kinds of syntax-highlighting, stripping cruft, etc.
#
# This script automates all of that: it cleans up, compiles a hakyll binary for faster compilation,
# generates a sitemap XML file, optimizes the MathJax use, checks for many kinds of errors, uploads,
# and cleans up.
#
# Key dependencies: GHC, Hakyll, Emacs, curl, tidy (HTML5 version), git, regex-compat-tdfa (Unicode
# Haskell regexps), urlencode ('gridsite-clients' package), linkchecker, fdupes, ImageMagick, exiftool,
# mathjax-node-page (eg. `npm i -g mathjax-node-page`), parallel, xargs, php-cli, php-xml, libreoffice,
# gifsicle, libxml2-utils…

cd ~/wiki/
# shellcheck source=/home/gwern/wiki/static/build/bash.sh
. ./static/build/bash.sh # import a bunch of Bash utilities for output formatting, checks, file IO etc: red/bold, wrap, gf/gfc/gfv/ge/gec/gev, png2JPGQualityCheck, gwmv...

DEPENDENCIES=(bc curl dos2unix du elinks emacs exiftool fdupes feh ffmpeg file find firefox ghc ghci runghc hlint gifsicle git identify inotifywait jpegtran jq libreoffice linkchecker locate mogrify ocrmypdf pandoc parallel pdftk pdftotext php ping optipng rm rsync sed tidy urlencode x-www-browser xargs xmllint xprintidle static/build/anchor-checker.php static/build/generateBacklinks.hs static/build/generateDirectory.hs static/build/generateLinkBibliography.hs static/build/generateSimilarLinks.hs static/build/link-extractor.hs compressJPG2) # ~/src/node_modules/mathjax-node-page/bin/mjpage, beautifulsoup-4
DEPENDENCIES_MISSING=()
for DEP in "${DEPENDENCIES[@]}"; do
    if ! command -v "$DEP" &> /dev/null; then
        DEPENDENCIES_MISSING+=("$DEP")
    fi
done
if [ ${#DEPENDENCIES_MISSING[@]} -ne 0 ]; then
    red "Error: missing dependencies!"
    echo "${DEPENDENCIES_MISSING[@]}"
    exit 1
fi
# conda activate pytorch # set up virtual environment to get 'beautifulsoup-4' for Python utilities

# cleanup:
rm --recursive --force -- ./_cache/ ./_site/

if [ "$(df --block-size=G ~/ | awk 'NR==2 {print $4}' | sed 's/G//')" -lt 3 ]; then
    red "Error: Less than 3GB of free space in home directory; one cannot reliably compile Gwern.net with so little space, so exiting." >&2
    exit 2
fi
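# The bash.sh helpers are used heavily below; for orientation, rough, hypothetical, simplified
# equivalents of the three most common ones follow. These are illustrations only (the real
# definitions live in ./static/build/bash.sh, and are more careful), so they are left commented-out
# to avoid shadowing the sourced versions:
#
#    gf()   { grep -F "$@"; }        # fixed-string grep
#    gfv()  { grep -F -v "$@"; }     # fixed-string grep, inverted match
#    wrap() { local OUT; OUT="$("$1" 2>&1)"; if [ -n "$OUT" ]; then red "$2"; echo "$OUT"; fi; }
#           # run a check function, & print its warning header iff the check emitted any output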
if [ -n "$(pgrep hakyll)" ]
then red "Error: Hakyll already running?"
else
    set -e

    # lower priority of everything we run (some of it is expensive):
    renice --priority 19 --pid "$$" &>/dev/null
    ionice --class 3 --pid "$$" &>/dev/null

    ## Parallelization: WARNING: post-2022-03 Hakyll uses parallelism which catastrophically slows down at >= # of physical cores; see
    N=6
    SLOW="true"
    SKIP_DIRECTORIES=""
    TODAY=$(date '+%F')
    for ARG in "$@"; do case "$ARG" in
                            --fast) SLOW="" ;;
                            --skip-directories) SKIP_DIRECTORIES="true" ;;
                            *[!0-9]*) ;; # skip non-numbers
                            *) N="$ARG" ;;
                        esac
    done
    export SLOW SKIP_DIRECTORIES N

    if [ "$SLOW" ]; then
        (cd ~/wiki/ && git status) || true # quickly summarize pending changes
        bold "Checking metadata…"
        pkill checkMetadata || true
        rm ~/METADATA.txt &> /dev/null || true
        TMP_CHECK=$(mktemp /tmp/"XXXXX.txt")
        ./static/build/checkMetadata >> "$TMP_CHECK" 2>&1 && mv "$TMP_CHECK" ~/METADATA.txt || true &
    fi &

    bold "Pulling infrastructure updates…"
    # pull from Said Achmiz's repo, with his edits overriding mine in any conflict (`-Xtheirs`) & auto-merging with the default patch text (`--no-edit`), to make sure we have the latest JS/CSS. (This is a bit tricky because the use of versioning in the includes means we get a lot of merge conflicts, for some reason.)
    (cd ./static/ && git status && timeout 5m git pull -Xtheirs --no-edit --verbose 'https://gwern.obormot.net/static/.git/' master) || true

    if [ "$SLOW" ]; then
        bold "Executing string rewrite cleanups…" # automatically clean up some Gwern.net bad URL patterns, typos, inconsistencies, house-styles:
        (
            s() { gwsed "$@"; }
            set +e

            ## domain/URL rewrites:
            s 'https://mobile.x.com' 'https://x.com'; s 'https://www.x.com' 'https://x.com'; s 'https://twitter.com/' 'https://x.com/'; s 'https://en.reddit.com/' 'https://www.reddit.com/';
            s 'https://www.greaterwrong.com/posts/' 'https://www.lesswrong.com/posts'; s 'http://web.archive.org/web/' 'https://web.archive.org/web/';
            s 'https://youtu.be/' 'https://www.youtube.com/watch?v='; s 'http://arxiv.org' 'https://arxiv.org'; s 'https://deepmind.com' 'https://www.deepmind.com';
            s 'http://en.wikipedia.org' 'https://en.wikipedia.org';
            s 'v1.full' '.full'; s 'v2.full' '.full'; s 'v3.full' '.full'; s 'v4.full' '.full'; s 'v5.full' '.full'; s 'v6.full' '.full'; s 'v7.full' '.full'; s 'v8.full' '.full'; s 'v9.full' '.full';
            s '.full-text' '.full'; s '.full-text.full' '.full'; s '.full.full.full' '.full'; s '.full.full' '.full';
            s '.gov/labs/pmc/articles/P' '.gov/pmc/articles/P';
            s 'rjlipton.wpcomstaging.com' 'rjlipton.wordpress.com'; s 'www.super-memory.com' 'super-memory.com'; s 'https://www.bldgblog.com' 'https://bldgblog.com';
            s 'https://www.clinicaltrials.gov' 'https://clinicaltrials.gov'; s 'https://arxiv.org/abs//' 'https://arxiv.org/abs/';
            s 'http://paulgraham.com' 'https://paulgraham.com'; s 'http://www.paulgraham.com' 'https://paulgraham.com'; s "https://www.paulgraham.com" "https://paulgraham.com";
            s 'https://scribe.rip' 'https://www.freedium.cfd'; s 'https://www.arxiv.org/' 'https://arxiv.org/';
            ## NOTE: domains which are bad or unfixable are handled by a later lint. This is only for safe rewrites.
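            ## (For orientation: `gwsed`, aliased to `s` above, is bash.sh's site-wide fixed-string
            ## search-and-replace. A hypothetical, simplified sketch of the idea, for illustration
            ## only; the real definition is more careful about quoting & which files it touches:
            ##
            ##    gwsed_sketch () { local FROM="$1" TO="$2"
            ##        find . -type f \( -name "*.md" -o -name "*.gtx" \) -print0 | \
            ##            xargs -0 grep -F -l -- "$FROM" | \
            ##            xargs -r stringReplace "$FROM" "$TO"; }
            ##
            ## ie. only files actually containing the target string get rewritten, so file mtimes stay honest.)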
            ## link cruft rewrites:
            s '&hl=en&oi=ao' ''; s '&hl=en' ''; s '?hl=en&' '?'; s '?hl=en' ''; s '?usp=sharing' ''; s '?via%3Dihub' ''; s '.html?pagewanted=all' '.html';
            s '&feature=youtu.be' ''; s ':443/' '/'; s ':80/' '/'; s '?s=r' ''; s '?s=61' ''; s '?sd=pf' ''; s '?ref=The+Browser-newsletter' ''; s '?ref=thebrowser.com' '';
            s '?ignored=irrelevant' ''; s '](/docs/' '](/doc/'; s 'href="/docs/' 'href="/doc/'; s '.pdf#pdf' '.pdf'; s '#fromrss' ''; s '?rss=1' '';
            s '/doc/statistics/decision-theory' '/doc/statistics/decision'; s '?ref=quillette.com' ''; s '?login=false' ''; s '?open=false#' '#';
            stringReplace '&oi=ao' '' ./static/build/Config/Metadata/Author.hs; stringReplace '&hl=en' '' ./static/build/Config/Metadata/Author.hs;

            ## name/entity consistency:
            s 'EMBASE' 'Embase'; s 'Medline' 'MEDLINE'; s 'PsychINFO' 'PsycINFO'; s 'MSCOCO' 'MS COCO'; s 'Yann Le Cun' 'Yann LeCun'; s ' VQVAE' ' VQ-VAE';
            s 'CIFAR 10' 'CIFAR-10'; s 'Jorges Luis Borges' 'Jorge Luis Borges'; s 'Rene Girard' 'René Girard'; s 'Anno Hideaki' 'Hideaki Anno'; s ' GPT2' ' GPT-2';
            s ' Clinicaltrials.gov' ' ClinicalTrials.gov'; s ' clinicaltrials.gov' ' ClinicalTrials.gov'; s 'Dario Amodai' 'Dario Amodei';
            s 'single nucleotide polymorph' 'single-nucleotide polymorph'; s 'Single Nucleotide Polymorph' 'Single-Nucleotide Polymorph'; s 'single nucleotide variant' 'single-nucleotide variant';
            s ' CIFAR10' ' CIFAR-10'; s 'TyDi QA' 'TyDiQA'; s 'Türkiye' 'Turkey'; s ' Poincare' ' Poincaré'; s 'Francois de La Rochefoucauld' 'François de La Rochefoucauld';
            s 'Moliere' 'Molière'; s 'behavioural genetic' 'behavioral genetic'; s ' gwern.net' ' Gwern.net';
            s 'chain of thought' 'chain-of-thought'; s 'Chain Of Thought' 'Chain-Of-Thought'; s 'Chain of Thought' 'Chain-of-Thought'; s 'Chain of thought' 'Chain-of-thought';
            s 'MS Marco' 'MS MARCO'; s 'MS-MARCO' 'MS MARCO'; s 'NLSY-79' 'NLSY79'; s 'NLSY-97' 'NLSY97';
            s 'state of the art' 'state-of-the-art'; s 'State of the Art' 'State-of-the-Art'; s 'State of the art' 'State-of-the-art'; s 'State Of The Art' 'State-of-the-Art';
            s 'Enwik8' 'enwik8'; s 'enwiki8' 'enwik8'; s 'G. M. Fahy' 'Gregory M. Fahy'; s 'Greg M. Fahy' 'Gregory M. Fahy'; s 'Gary Kasparov' 'Garry Kasparov';
            s 'Fel D1' 'Fel D 1'; s 'Fel d1' 'Fel d 1'; s 'CIFAR10' 'CIFAR-10'; s 'ImageNet1k' 'ImageNet-1k'; s 'ImageNet21k' 'ImageNet-21k'; s ' LeGuin' ' Le Guin';
            s 'DALL-E 1' 'DALL·E 1'; s 'DALL-E 2' 'DALL·E 2'; s 'DALLE-2 ' 'DALL·E 2 '; s 'DALL-E 3' 'DALL·E 3'; s 'FLAN-PALM' 'Flan-PaLM';
            s 'GPT-4V' 'GPT-4-V'; s 'GPT-4 V' 'GPT-4-V'; s ' GPT4' ' GPT-4'; s ' GPT3' ' GPT-3';
            s 'drop cap' 'dropcap'; s 'Drop cap' 'Dropcap'; s 'Drop Cap' 'Dropcap'; s 'R.A. Fisher' 'R. A. Fisher'; s 'Larry Sumners' 'Larry Summers';
            s ' auto-encoder' ' autoencoder'; s 'Auto-Encoder' 'Autoencoder';

            ## abbreviation consistency:
            s '(ie,' '(ie.'; s '(ie ' '(ie. '; s 'i.e.,' 'ie.'; s 'ie., ' 'ie. '; s '(i.e.' '(ie.'; s '(eg, ' '(eg. '; s ' eg ' ' eg. '; s '(eg ' '(eg. '; s '[eg ' '[eg. ';
            s '[Eg ' '[eg. '; s 'e.g. ' 'eg. '; s ' e.g. ' ' eg. '; s 'e.g.,' 'eg.'; s 'eg.,' 'eg.'; s 'E.g.,' 'Eg.'; s '(cf ' '(cf. '; s ' cf ' ' cf. ';
            s ' Feb ' ' February '; s ' Aug ' ' August '; s ', Jr.' ' Junior'; s ' Jr.' ' Junior'; s ', Junior' ' Junior';
            s '<sup>Th</sup>' '<sup>th</sup>'; s ' 20th' ' 20<sup>th</sup>'; s ' 21st' ' 21<sup>st</sup>'; s ',”' '”,'; s ",’" "’,";
            ### NOTE: Not safe to do site-wide with `gwsed` because it stomps all over R transcripts, where quartiles
            ### are often reported in summaries like '1st'; we can do it safely for GTX because no R sessions there (for now):
            stringReplace '<sup>St</sup>' '<sup>st</sup>' ./metadata/*.gtx; stringReplace '<sup>Nd</sup>' '<sup>nd</sup>' ./metadata/*.gtx; stringReplace '<sup>Rd</sup>' '<sup>rd</sup>' ./metadata/*.gtx;
            stringReplace ' 1st ' ' 1<sup>st</sup> ' ./metadata/*.gtx; stringReplace ' 2nd' ' 2<sup>nd</sup>' ./metadata/*.gtx; stringReplace ' 3rd' ' 3<sup>rd</sup>' ./metadata/*.gtx; stringReplace ' 4th' ' 4<sup>th</sup>' ./metadata/*.gtx;

            ## spelling errors:
            s 'border colly' 'border collie'; s 'genomewide' 'genome-wide'; s 'regularise' 'regularize'; s ' residualis' ' residualiz';
            s 'endelian randomisation' 'endelian randomization'; s 'mendelian randomization' 'Mendelian Randomization'; s 'Mendelian randomization' 'Mendelian Randomization';
            s 'canalization' 'canalisation';
            s 'Statistical significance' 'Statistical-significance'; s 'Statistical Significance' 'Statistical-Significance'; s 'statistical significance' 'statistical-significance';
            s ' longstanding' ' long-standing'; s 'utilise' 'utilize'; s 'facebookok' 'facebook'; s 'Tartarian' 'Tatarian'; s 'tartarian' 'tatarian';
            s ' an One' ' a One'; s ' an one' ' a one';
            s '

he ' '

He ';
            s ' lik ' ' like '; s ' Behaviour ' ' Behavior '; s ' behaviour ' ' behavior '; s ' anaesthesia' ' anesthesia'; s ' Modelling' ' Modeling'; s ' modelling' ' modeling';

            ## citation consistency:
            s ']^[' '] ^['; s 'et. al.' 'et al'; s 'et al. (' 'et al ('; s ' et al. 1' ' et al 1'; s ' et al. 2' ' et al 2'; s ' et al., ' ' et al '; s 'et al., ' 'et al ';
            ### WARNING: when using `+` in sed, by default, it is treated as an ordinary literal. It MUST be escaped to act as a regexp! Whereas in `grep --extended-regexp`, it's the opposite. So remember: `\+` in sed, and `+` in grep.
            ### WARNING: remember that `sed -i` modifies the last-modified timestamp of all files it runs on, even when the file was not, in fact, modified!
            for file in $(find . -type f \( -name "*.md" -or -name "*.gtx" \)); do
                if grep -qE "[A-Z][a-z]+ et al \([1-2][0-9]{3}[a-z]?\)" "$file"; then
                    sed -i -e 's/\([A-Z][a-z]\+\) et al (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 et al \2/g' "$file"
                fi
                if grep -qE "[A-Z][a-z]+ and [A-Z][a-z]+ \([1-2][0-9]{3}[a-z]?\)" "$file"; then
                    sed -i -e 's/\([A-Z][a-z]\+\) and \([A-Z][a-z]\+\) (\([1-2][0-9][0-9][0-9][a-z]\?\))/\1 \& \2 \3/g' "$file"
                fi
            done
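            ### To illustrate the `+`-escaping asymmetry warned about above (throwaway inputs, for illustration only):
            ###     echo 'aaa' | sed -e 's/a\+/X/'   # prints 'X':   in sed's BRE, `\+` means one-or-more
            ###     echo 'aaa' | sed -e 's/a+/X/'    # prints 'aaa': in BRE, a bare `+` is just a literal plus
            ###     echo 'aaa' | grep -E 'a+'        # matches:      in ERE, a bare `+` means one-or-more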
            ## anchor errors:
            s '#allen#allen' '#allen'; s '#deepmind#deepmind' '#deepmind'; s '&org=deepmind&org=deepmind' '&org=deepmind'; s '#nvidia#nvidia' '#nvidia'; s '#openai#openai' '#openai'; s '#google#google' '#google'; s '#uber#uber' '#uber';

            ## HTML/Markdown formatting:
            s ' _n_s' ' <em>n</em>s'; s ' (n = ' ' (<em>n</em> = '; s ' (N = ' ' (<em>n</em> = ';
            s ' de novo ' ' <em>de novo</em> '; s ' De Novo ' ' <em>De Novo</em> '; s '. De novo ' '. <em>De novo</em> ';
            s 'backlinks-not' 'backlink-not';
            s '，' ','; s '：' ': '; s '；' ';'; # normalize fullwidth CJK punctuation
            s ' _r_s' ' <em>r</em>s';
            s '# External links' '# External Links'; s '# See also' '# See Also';
            s '"abstract-collapse abstract"' '"abstract abstract-collapse"';
            s "‐" "-"; s 'class="link-auto"' ''; s '𝑂(' '𝒪('; s ' and ' ' & ';
            s 'class="invertible"' 'class="invert"'; s '”>' '">';
            s ' id="cb1"' ''; s ' id="cb2"' ''; s ' id="cb3"' ''; s ' id="cb4"' '';
            s '.svg-530px.jpg' '.svg'; s ' (”' ' (“';
            s '
’s' '’s';
            s '-530px.jpg' ''; s '-768px.png' ''; s '-768px.jpg' '';
            s '—-' '—'; s 'collapse-summary' 'abstract-collapse'; s 'collapse-abstract' 'abstract-collapse';
            s 'href="ttp' 'href="http'; s '\xmlpi{\\}' '';
            s '°C' '℃'; s '° C' '℃'; s '°F' '℉'; s '° F' '℉'; s '℉ahrenheit' '℉'; s '℃elsius' '℃'; s ' ℃' '℃'; s ' ℉' '℉';
            s 'marginnnote' 'marginnote';
            s '](//' '](/'; s '{.full-width' '{.width-full';
            s '


' '

';
            s '](/home/gwern/wiki/' '](/'; s '](wiki/' '](/';
            s 'Cite-Author' 'cite-author'; s 'cite-author-Plural' 'cite-author-plural'; s 'Cite-Date' 'cite-date'; s 'Cite-Joiner' 'cite-joiner'; s 'class="Cite' 'class="cite';
            s 'Logotype-Tex' 'logotype-tex'; s 'Date-Range' 'date-range'; s 'Inflation-Adjusted' 'inflation-adjusted'; s 'Logotype-Latex-A' 'logotype-latex-a'; s 'Logotype-Latex-E' 'logotype-latex-e'; s 'SUbsup' 'subsup';
            s ' ”' ' “'; s '[("doi","")]' ''; s '>/a>' '></a>'; s 'href="W!"' 'href="!W"';
            s 'class="Logotype-Tex"' 'class="logotype-tex"'; s 'Class="Logotype-Tex"' 'class="logotype-tex"';
            s 'thumbnailText: ' 'thumbnail-text: '; s ' — ' '—'; s '_n_=' '_n_ = '; s '< a href' '<a href';
            ## duplicated 'odd'/'even' classes were fixed in pandoc 3.1.1 (2023-03-05), so can remove these two rewrites once I upgrade past that:
            s 'class="odd odd' 'class="odd'; s 'class="even even' 'class="even';
            s '  ' ' '; s '​ ' ' ';
        ) &> /dev/null &
        sed -i -e 's/ data-link-\?[Tt]ags="[a-z0-9 \/-]\+">/>/' ./metadata/*.gtx
    fi

    bold "Compiling…"
    cd ./static/build
    WARNINGS=""
    if [ "$SLOW" ]; then WARNINGS="-Wall -Werror"; fi
    compile () { ghc -O2 $WARNINGS -rtsopts -threaded --make "$@"; }
    compile hakyll.hs
    if [ -z "$SKIP_DIRECTORIES" ]; then
        compile generateLinkBibliography.hs
        compile generateDirectory.hs
    fi
    compile preprocess-markdown.hs
    compile guessTag.hs &
    compile changeTag.hs &
    compile checkMetadata.hs &
    compile generateSimilarLinks.hs &
    ## NOTE: the generateSimilarLinks & link-suggester.hs runs are done at midnight by a cron job because
    ## they are too slow to run during a regular site build & don't need to be super-up-to-date anyway
    cd ../../

    if [ "$SLOW" ]; then
        bold "Checking embeddings database…"
        ghci -istatic/build/ ./static/build/GenerateSimilar.hs -e 'e <- readEmbeddings' &>/dev/null

        # duplicates a later check, but if we have a fatal link error, we'd rather find out now than 30 minutes later while generating annotations:
        λ(){ gf -e 'href=""' -e 'href="!W">' -e "href='!W'>" -- ./metadata/*.gtx || true; }
        wrap λ "Malformed empty link in annotations?"

        # another early fatal check: if there is a Markdown file 'foo.md' and also a subdirectory 'foo/' in the same directory, then this will result in, later, a fatal error when one tries to compile 'foo.md' → 'foo' (the HTML file) but 'foo' (the directory) already exists.
        # Check if any files collide with directories of the same name (without the .md extension).
        # Usage: find_colliding_files [path]
        function find_colliding_files () { # GPT-3-written:
            set -euo pipefail
            path="${1:-.}"
            find "$path" -depth -type f -name "*.md" -exec sh -c '
                for file do
                    path="$(dirname "$file")/$(basename "$file" ".md")"
                    if [ -e "$path" ] && [ ! -L "$path" ]; then
                        if [ -d "$path" ]; then
                            printf "Fatal error: Directory exists with the same name as the file %s\n" "$file" >&2
                            exit 3
                        else
                            printf "Fatal error: Non-directory file exists with the same name as the file %s (minus the .md extension)\n" "$file" >&2
                            exit 4
                        fi
                    fi
                done' sh {} +
        }
        find_colliding_files ./
        # We update the linkSuggestions.el in a cron job, because it is too expensive & vastly slows down the build.
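        # `sort_by_lastmodified` below comes from bash.sh: it reads paths on stdin & reorders them by
        # file modification time. A hypothetical, simplified equivalent, for illustration only (the
        # sort direction shown here, newest-first, is an assumption):
        #
        #    sort_by_lastmodified () { xargs -r -d '\n' stat --format '%Y %n' -- | sort --reverse --numeric-sort | cut --delimiter=' ' --fields=2-; }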
        # Update the directory listing index pages: there are a number of directories we want to avoid,
        # like the various mirrors or JS projects, or directories just of data like CSVs, or dumps of
        # docs, so we'll blacklist those:
        DIRECTORY_TAGS="$(find doc/ fiction/ haskell/ newsletter/ nootropic/ note/ review/ sicp/ zeo/ -type d \
                              | gfv -e 'doc/www' -e 'doc/rotten.com' -e 'doc/genetics/selection/www.mountimprobable.com' \
                                    -e 'doc/biology/2000-iapac-norvir' -e 'doc/gwern.net-gitstats' -e 'doc/reinforcement-learning/armstrong-controlproblem' \
                                    -e 'doc/statistics/order/beanmachine-multistage' -e 'doc/personal/2011-gwern-yourmorals.org/' \
                                    -e 'confidential/' -e 'private/' -e 'secret/' -e 'newest/' | \
                                  sort_by_lastmodified)"

        if [ -z "$SKIP_DIRECTORIES" ]; then
            bold "Writing missing annotations to support link-bibliography/tag-directory updates…"
            # We add new annotations daily, but all the link-bibliography/tag-directory code deals only with the current annotations which have already been written out to disk as HTML snippets; since that write-out happens in the main compilation phase, the default would be that new annotations are omitted the first day and appear only on the next build. This is annoying, and manually working around it is even more tedious, so we provide a 'one-shot' missing-annotation mode and call that phase immediately before the link-bibliography/tag-directory phase:
            ./static/build/hakyll build +RTS -N"$N" -RTS --annotation-missing-one-shot ; ./static/build/hakyll build clean

            bold "Updating link bibliographies…"
            ./static/build/generateLinkBibliography +RTS -N"$N" -RTS || true

            # we want to generate all directories first before running Hakyll, in case a new tag was created:
            bold "Building directory indexes…"
            ./static/build/generateDirectory +RTS -N2 -RTS $DIRECTORY_TAGS # (unquoted: intentional word-splitting, one argument per directory)

            # ensure that the list of test-cases has been updated, so we can look at it immediately after the current sync (rather than delaying it to after the next sync):
            λ() { ghci -istatic/build/ ./static/build/LinkLive.hs \
                       -e 'do { l <- linkLivePrioritize; putStrLn (Text.Show.Pretty.ppShow l); }' | \
                      gfv -e ' secs,' -e 'it :: ()' -e '[]'; }
            wrap λ "Need link live whitelist/blacklisting?" &
        else
            # we don't rebuild *all* tag-directories, but we will build any empty ones (ie. newly-created ones), because otherwise they will kill a fast sync (and I'd often forget at night that it has to be a full sync after creating a tag during the day):
            DIRECTORIES_EMPTY="$(find ./doc/ -type f -name "index.md" -size 0)"
            [ -n "$DIRECTORIES_EMPTY" ] && runghc -istatic/build/ ./static/build/generateDirectory $DIRECTORIES_EMPTY
        fi
    fi

    bold "Check & update VCS…"
    (ping -q -c 5 google.com &> /dev/null && cd ./static/ && git status; git pull; git push --verbose &) || true

    # Cleanup pre:
    rm --recursive --force ./static/build/*.o ./static/build/*.hi ./static/build/generateDirectory ./static/build/generateLinkBibliography ./static/build/generateBacklinks || true

    cd ~/wiki/ # go to site root
    bold "Building site…"

    # make sure all videos have 'poster' preview images:
    for VIDEO in $(find . -type f \( -name "*.mp4" -o -name "*.webm" -o -name "*.avi" \) | gfv "doc/www/"); do
        # we skip posters for videos in the /doc/www/* split-archive mirrors, because nothing ever sets a poster on them, so generating one would just waste space
        POSTER="$VIDEO-poster.jpg"
        if [ ! -f "$POSTER" ]; then
            echo "Generating poster image for $VIDEO…"
            # Problem: embedded videos (eg. https://gwern.net/lorem-multimedia#video ) otherwise all look like generic small black rectangles; the user has no idea what a video is until they click to begin downloading the (possibly huge) video file. This also causes layout shift as the `<video>` element loads in.
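            # (The original frame-extraction command is truncated in this copy of the script; the
            # following is a minimal sketch of the step, assuming ffmpeg. The 1s seek offset & JPEG
            # quality are hypothetical choices, not necessarily the original's:)
            ffmpeg -i "$VIDEO" -ss 00:00:01 -frames:v 1 -q:v 2 "$POSTER" || true
        fi
    done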