#!/usr/bin/env bash set -euo pipefail # ----------------------------------------------------------------------------- # Hugo selective output emission script # ----------------------------------------------------------------------------- # # Purpose # ------- # This script determines which files in the `public/` directory should be # emitted (e.g. for deployment or syncing). # # It runs BEFORE the Hugo build and therefore predicts some outputs based on # `content/` rather than reading them from `public/`. # # ----------------------------------------------------------------------------- # Modes # ----------------------------------------------------------------------------- # # 1. GLOBAL_OUTPUTS # Always emitted if present in `public/` # # 2. RULE-BASED ENTRIES # Format: # || # # ----------------------------------------------------------------------------- # 3. SOURCE PATTERN MODE (PRE-BUILD SAFE) # ----------------------------------------------------------------------------- # # Format: # || # # Example: # index.html,index.xml|content/archive/**/_index.md|include-parents # # Behavior: # # - Scans content/ (NOT public/) # - Predicts output paths in public/ # - Uses presence-based detection (manifest) # # ----------------------------------------------------------------------------- # Pattern precedence (IMPORTANT) # ----------------------------------------------------------------------------- # # More specific patterns automatically override broader ones. # # Example: # # content/archive/articles/**/_index.md (more specific) # content/archive/**/_index.md (less specific) # # The broader pattern will automatically IGNORE anything matched by the # more specific one. This prevents duplicate or conflicting emissions. # # This behavior is automatic and does NOT depend on array order. # # ----------------------------------------------------------------------------- CONTENT_DIR="content/posts" PUBLIC_DIR="public" CHECK_NEWEST="${CHECK_NEWEST:-10}" STATE_MANIFEST=".hugo-extra-output-manifest" DEBUG=0 TEST_MODE=0 for arg in "$@"; do case "$arg" in --debug) DEBUG=1 ;; --test) TEST_MODE=1 ;; *) echo "Unknown argument: $arg" >&2 exit 1 ;; esac done debug() { if [ "$DEBUG" -eq 1 ]; then printf "[debug] %s\n" "$*" >&2 fi } GLOBAL_OUTPUTS=( "index.html" "index.xml" "index-full.xml" "feed.json" "articles.html" "articles-full.xml" "articles-title.xml" "api/index.html" "api/index.xml" "api/reading/index.json" "feeds/index.html" "tags/index.json" "tags/index.html" "lastbuild.txt" "sitemap.xml" ) EXTRA_OUTPUTS=( "reading/index.html|content/reading.md|tags=book-start,books-read,review|categories=book-start,book-stop" "reading-series/index.html|content/reading-series.md" "links/index.html|content/links.md|categories=linkblog" "tools/index.html|content/tools.md" "about/index.html|content/about.md|tags=musical,books-read" "start_here/index.html|content/start_here.md" "index.html,index.xml|content/archive/articles/**/_index.md|include-parents" "index.html,index.xml|content/archive/**/_index.md|include-parents" ) add_if_exists() { local path="$1" [ -e "$path" ] && printf '%s\n' "$path" } add_file() { local path="$1" printf '%s\n' "$path" } file_hash() { if command -v shasum >/dev/null 2>&1; then shasum "$1" | awk '{print $1}' else sha1sum "$1" | awk '{print $1}' fi } file_known_in_manifest() { local file="$1" [ -f "$STATE_MANIFEST" ] || return 1 grep -q "^$file=" "$STATE_MANIFEST" } # ----------------------------------------------------------------------------- # Pattern helpers # ----------------------------------------------------------------------------- pattern_base() { printf '%s\n' "$1" | sed -E 's@[*].*$@@; s@/+$@@' } # returns 0 if file should be skipped because a more specific pattern exists is_shadowed() { local file="$1" local current_base="$2" for other_base in "${PATTERN_BASES[@]}"; do if [ "$other_base" = "$current_base" ]; then continue fi # other_base is more specific than current_base if [[ "$other_base" == "$current_base/"* ]]; then # file belongs to the more specific pattern if [[ "$file" == "$other_base/"* ]]; then debug " skipping shadowed file: $file (shadowed by $other_base)" return 0 fi fi done return 1 } # ----------------------------------------------------------------------------- # Existing helpers (unchanged) # ----------------------------------------------------------------------------- source_file_changed() { local file="$1" [ -f "$file" ] || return 1 local new_hash new_hash=$(file_hash "$file") local old_hash="" if [ -f "$STATE_MANIFEST" ]; then old_hash=$(grep "^$file=" "$STATE_MANIFEST" | cut -d= -f2 || true) fi debug "checking hash for $file" debug " old: ${old_hash:-}" debug " new: $new_hash" [ "$new_hash" != "${old_hash:-}" ] } slugify() { local value value=$(printf '%s' "$1" | iconv -f UTF-8 -t ASCII//TRANSLIT 2>/dev/null || printf '%s' "$1") value=$(printf '%s' "$value" \ | tr '[:upper:]' '[:lower:]' \ | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//; s/-+/-/g') printf '%s\n' "$value" } front_matter() { awk ' NR == 1 && ($0 == "---" || $0 == "+++") { delim=$0; next } delim != "" { if ($0 == delim) exit print } ' "$1" } # ----------------------------------------------------------------------------- # YAML taxonomy extraction (tags / categories) # # The functions `extract_inline_array` and `extract_yaml_list` parse Hugo # front matter using awk in a dependency-free way. # # Supported formats: # # Inline arrays: # tags: [ai, coding] # tags: ["ai, coding"] # # YAML lists: # tags: # - ai # - "ai, coding" # # How it works (short): # # - extract_inline_array # - Detects: key: [...] # - Strips brackets # - Uses a small character-based parser (emit_csv) to split on commas # ONLY outside quotes # - Ensures quoted values like "ai, coding" stay a single item # # - extract_yaml_list # - Detects: key: # - Reads subsequent "- value" lines # - Stops when encountering a non-list line # # - Shared helper functions inside awk: # - emit_csv(line) # → quote-aware splitting # - clean_and_emit(value) # → trims whitespace, removes surrounding quotes, skips empty values # # IMPORTANT: # These helper functions MUST be defined inside each awk script, because # awk functions are NOT shared across separate awk invocations. # # Notes: # - Designed for simple Hugo front matter (not full YAML spec) # - Handles quoted commas correctly # - Fully portable (no external dependencies) # ----------------------------------------------------------------------------- extract_inline_array() { local key="$1" awk -v key="$key" ' function clean_and_emit(value) { gsub(/^[[:space:]]+|[[:space:]]+$/, "", value) gsub(/^["'\''"]|["'\''"]$/, "", value) if (value != "") print value } function emit_csv(line, i, c, value, in_quotes) { value = "" in_quotes = 0 for (i = 1; i <= length(line); i++) { c = substr(line, i, 1) if (c == "\"" || c == "'\''") { in_quotes = !in_quotes continue } if (c == "," && !in_quotes) { clean_and_emit(value) value = "" continue } value = value c } clean_and_emit(value) } BEGIN { collecting = 0 buffer = "" } # detect start of inline array $0 ~ key ":" { line = $0 prefix = key ":" pos = index(line, prefix) if (pos > 0) { line = substr(line, pos + length(prefix)) } gsub(/^[[:space:]]+|[[:space:]]+$/, "", line) if (line ~ /^\[/) { collecting = 1 buffer = line # if already closed on same line → process immediately if (line ~ /\]/) { collecting = 0 process_buffer() } next } } # continue collecting multiline array collecting { buffer = buffer " " $0 if ($0 ~ /\]/) { collecting = 0 process_buffer() } next } function process_buffer( line) { line = buffer # normalize whitespace gsub(/\n/, " ", line) gsub(/[[:space:]]+/, " ", line) # strip brackets sub(/^[[:space:]]*\[/, "", line) sub(/\][[:space:]]*$/, "", line) emit_csv(line) } END { if (collecting) { print "syntax error: unclosed inline array for " key > "/dev/stderr" exit 1 } exit 0 } ' } extract_yaml_list() { local key="$1" awk -v key="$key" ' function clean_and_emit(value) { gsub(/^[[:space:]]+|[[:space:]]+$/, "", value) gsub(/^["'\''"]|["'\''"]$/, "", value) if (value != "") print value } BEGIN { in_list = 0 } $0 ~ "^[[:space:]]*" key ":[[:space:]]*$" { in_list = 1 next } in_list { if ($0 ~ /^[[:space:]]*-[[:space:]]+/) { line = $0 sub(/^[[:space:]]*-[[:space:]]+/, "", line) clean_and_emit(line) next } if ($0 ~ /^[[:space:]]*$/) { next } in_list = 0 } END { exit 0 } ' } extract_taxonomy_terms() { local file="$1" local taxonomy="$2" local fm fm="$(front_matter "$file")" local out_inline out_yaml out_inline=$(printf '%s\n' "$fm" | extract_inline_array "$taxonomy") || return 1 out_yaml=$(printf '%s\n' "$fm" | extract_yaml_list "$taxonomy") || return 1 # remove empty lines printf '%s\n' "$out_inline" "$out_yaml" | awk 'NF' } find_new_posts() { find "$CONTENT_DIR" -mindepth 3 -maxdepth 3 -type d \ -exec test -f "{}/index.md" \; -print \ | sed "s|$CONTENT_DIR/||" \ | sort -r \ | head -n "$CHECK_NEWEST" } post_needs_render() { local rel="$1" local slug slug=$(basename "$rel") local out="$PUBLIC_DIR/posts/$slug/index.html" [ ! -f "$out" ] } add_post_outputs() { local rel="$1" local slug slug=$(basename "$rel") local content_bundle="$CONTENT_DIR/$rel" local public_bundle="$PUBLIC_DIR/posts/$slug" printf '%s\n' "$public_bundle/index.html" find "$content_bundle" -type f ! -name "index.md" | while IFS= read -r asset; do relpath=$(printf '%s\n' "$asset" | sed "s|^$content_bundle/||") printf '%s\n' "$public_bundle/$relpath" done } add_taxonomy_outputs() { local postdir="$1" local content_file="$CONTENT_DIR/$postdir/index.md" if [ ! -f "$content_file" ]; then debug "skipping taxonomy (no index.md): $content_file" return fi debug "reading front matter from $content_file" for taxonomy in tags categories; do extract_taxonomy_terms "$content_file" "$taxonomy" | while IFS= read -r term; do # skip empty terms [ -z "$term" ] && continue debug "processing term: '$term'" slug=$(slugify "$term") || { debug "slugify failed for '$term'" continue } debug "found $taxonomy: '$term' -> $slug" add_file "$PUBLIC_DIR/$taxonomy/$slug/index.html" || true add_file "$PUBLIC_DIR/$taxonomy/$slug/index.xml" || true done done } collect_terms() { local taxonomy="$1" while IFS= read -r rel; do if post_needs_render "$rel"; then local content_file="$CONTENT_DIR/$rel/index.md" debug "collecting $taxonomy from $content_file" if [ -f "$content_file" ]; then extract_taxonomy_terms "$content_file" "$taxonomy" fi fi done < <(find_new_posts) } pattern_search_root() { printf '%s\n' "$1" | sed -E 's@[*].*$@@; s@/+$@@' } pattern_filename() { basename "$1" } find_matching_content_files() { local source="$1" local root local name root=$(pattern_search_root "$source") name=$(pattern_filename "$source") debug " pattern search root: $root" debug " pattern filename: $name" [ -d "$root" ] || return 0 find "$root" -type f -name "$name" } emit_parent_indexes_predicted() { local content_file="$1" local base="$2" shift 2 local outputs=("$@") local dir dir=$(dirname "$content_file") while true; do if [[ "$dir" == "content/$base" ]]; then break fi parent=$(dirname "$dir") if [ -f "$parent/_index.md" ]; then rel="${parent#content/}" for name in "${outputs[@]}"; do out="$PUBLIC_DIR/$rel/$name" debug " emitting parent index: $out" printf '%s\n' "$out" done else debug " skipping parent (no _index.md): $parent" fi dir="$parent" done } # ----------------------------------------------------------------------------- # Build pattern base list # ----------------------------------------------------------------------------- PATTERN_BASES=() for entry in "${EXTRA_OUTPUTS[@]}"; do IFS='|' read -r public source rest <<< "$entry" if [[ "$source" == *"*"* ]]; then base=$(pattern_base "$source") PATTERN_BASES+=("$base") fi done # ----------------------------------------------------------------------------- # Main processing # ----------------------------------------------------------------------------- process_extra_outputs() { local entry public source rest local new_tags new_tags=$(collect_terms tags | sort -u) local new_categories new_categories=$(collect_terms categories | sort -u) for entry in "${EXTRA_OUTPUTS[@]}"; do IFS='|' read -r public source rest <<< "$entry" # SOURCE PATTERN MODE if [[ "$source" == *"*"* ]]; then IFS=',' read -ra outputs <<< "$public" include_parents=0 if [[ "$entry" == *"|include-parents"* ]]; then include_parents=1 fi current_base=$(pattern_base "$source") debug "evaluating content pattern: $source" debug " base path: $current_base" debug " include parents: $include_parents" while IFS= read -r file; do [ -n "$file" ] || continue if is_shadowed "$file" "$current_base"; then continue fi debug " matched content file: $file" rel="${file#content/}" rel="${rel%/_index.md}" needs_emit=0 for name in "${outputs[@]}"; do out="$PUBLIC_DIR/$rel/$name" if file_known_in_manifest "$out"; then debug " skipping known file: $out" else debug " will emit: $out" needs_emit=1 fi done if [ "$needs_emit" -eq 1 ]; then for name in "${outputs[@]}"; do out="$PUBLIC_DIR/$rel/$name" if ! file_known_in_manifest "$out"; then debug " emitting new file: $out" printf '%s\n' "$out" fi done if [ "$include_parents" -eq 1 ]; then emit_parent_indexes_predicted "$file" "${current_base#content/}" "${outputs[@]}" fi fi done < <(find_matching_content_files "$source") continue fi # RULE-BASED MODE emit=0 if source_file_changed "$source"; then emit=1 fi if [ "$emit" -eq 1 ]; then add_if_exists "$PUBLIC_DIR/$public" fi done } update_manifest() { local tmp tmp=$(mktemp) for entry in "${EXTRA_OUTPUTS[@]}"; do IFS='|' read -r public source rest <<< "$entry" if [[ "$source" == *"*"* ]]; then IFS=',' read -ra outputs <<< "$public" while IFS= read -r file; do [ -n "$file" ] || continue rel="${file#content/}" rel="${rel%/_index.md}" for name in "${outputs[@]}"; do out="$PUBLIC_DIR/$rel/$name" hash=$(printf '%s' "$out" | shasum | awk '{print $1}') echo "$out=$hash" >> "$tmp" done done < <(find_matching_content_files "$source") continue fi if [ -f "$source" ]; then hash=$(file_hash "$source") echo "$source=$hash" >> "$tmp" fi done sort -u "$tmp" > "$STATE_MANIFEST" } # ----------------------------------------------------------------------------- # Testing # ----------------------------------------------------------------------------- run_yaml_tests() { echo "[test] running YAML parsing tests..." run_test() { local name="$1" local input="$2" local expected="$3" echo "[test] $name" result=$(printf '%s\n' "$input" | extract_inline_array tags; \ printf '%s\n' "$input" | extract_yaml_list tags) # normalize result result=$(printf '%s\n' "$result" | awk 'NF') if [ "$result" = "$expected" ]; then echo " ✓ PASS" else echo " ✗ FAIL" debug "expected:" debug "$expected" debug "got:" debug "$result" fi } run_test "inline simple" \ 'tags: [ai, coding]' \ $'ai\ncoding' run_test "inline quoted comma" \ 'tags: ["ai, coding"]' \ $'ai, coding' run_test "inline mixed" \ 'tags: ["meta", Hugo]' \ $'meta\nHugo' run_test "yaml list simple" \ $'tags:\n- ai\n- coding' \ $'ai\ncoding' run_test "yaml list quoted" \ $'tags:\n- "ai, coding"\n- Hugo' \ $'ai, coding\nHugo' run_test "empty inline" \ 'tags: []' \ '' run_test "multiline inline" \ "$(cat <<'EOF' tags: [ "string1", "string2", "string3", "string4", "string5", "string6" ] EOF )" \ $'string1\nstring2\nstring3\nstring4\nstring5\nstring6' echo "[test] done" } # Entry point if [ "$TEST_MODE" -eq 1 ]; then run_yaml_tests exit 0 fi { for f in "${GLOBAL_OUTPUTS[@]}"; do add_if_exists "$PUBLIC_DIR/$f" done process_extra_outputs while IFS= read -r rel; do if post_needs_render "$rel"; then add_post_outputs "$rel" add_taxonomy_outputs "$rel" fi done < <(find_new_posts) } | sed 's|^public/||' | sort -u update_manifest