Publications Access Graphs: Difference between revisions

From publications
Line 474: Line 474:
* All plotted points trace back to bucket TSV bins.
* All plotted points trace back to bucket TSV bins.
* /<x> and /<x-dir> variants MUST fold to the same canonical resource.
* /<x> and /<x-dir> variants MUST fold to the same canonical resource.
=Rollup code=
<pre>
root@padme:/home/ralph/AI# cat logrollup
#!/usr/bin/env perl
use strict;
use warnings;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Time::Piece;
use Getopt::Long;
use File::Path qw(make_path);
use File::Spec;
use URI::Escape qw(uri_unescape);
#BEGIN_MWDUMP
#title: CM-bucket-rollup invariants
#format: MWDUMP
#invariants:
#  1. server_name is first-class; never dropped; emitted in output schema and used for optional filtering.
#  2. input globs are expanded then processed in ascending mtime order (oldest -> newest).
#  3. time bucketing is purely mathematical: bucket_start = floor(epoch/period_seconds)*period_seconds.
#  4. badbot is definitive and detected ONLY by HTTP status == 308; no UA regex for badbot.
#  5. AI and bot are derived from /etc/nginx/bots.conf:
#    - only patterns mapping to 0 are "wanted"
#    - between '# good bots' and '# AI bots' => bot
#    - between '# AI bots' and '# unwanted bots' => AI_bot
#    - unwanted-bots section ignored for analytics classification
#  6. output TSV schema is fixed (total/host/path last; totals are derivable):
#      curlwget|ai|bot|human × (get|head|post|put|other) × (ok|redir|client_err|other)
#      badbot_308
#      total_hits server_name path
#  7. Path identity is normalised so the same resource collates across:
#      absolute URLs, query strings (incl action/edit), MediaWiki title=, percent-encoding, and trailing slashes.
#  8. --exclude-local excludes (does not count) local IP hits and POST+edit hits in the defined window, before bucketing.
#  9. web-farm safe: aggregation keys include bucket_start + server_name + path; no cross-vhost contamination.
# 10. bots.conf parsing must be auditable: when --verbose, report "good AI agent" and "good bot" patterns to STDERR.
# 11. method taxonomy is uniform for all agent categories: GET, HEAD, POST, PUT, OTHER (everything else).
#END_MWDUMP
# Program name as invoked; used as the prefix of every diagnostic and in the
# help text.
my $cmd = $0;

# -------- options --------
# Defaults: include local traffic, quiet, write to the current directory,
# one-hour buckets, and no server_name filter (empty string means "all").
my $EXCLUDE_LOCAL = 0;
my $VERBOSE       = 0;
my $HELP          = 0;
my $OUTDIR        = ".";
my $PERIOD        = "01:00";
my $SERVER        = "";

# Getopt::Long specification; the "!" suffix also accepts the --no-<name> form.
my @option_spec = (
    "exclude-local!" => \$EXCLUDE_LOCAL,
    "verbose!"       => \$VERBOSE,
    "help!"          => \$HELP,
    "outdir=s"       => \$OUTDIR,
    "period=s"       => \$PERIOD,
    "server=s"       => \$SERVER,    # optional filter; empty means all
);

# An unparseable command line and an explicit --help both end in usage().
usage() unless GetOptions(@option_spec);
usage() if $HELP;
# Print the command-line help to STDOUT and terminate the program with exit
# status 0. Takes no arguments and never returns.
#
# The "Columns" section describes the schema that the writer actually emits
# (see invariant 6 in the MWDUMP header): one counter per
# agent/method/outcome combination, then badbot_308, then
# total_hits, server_name and path.
sub usage {
    print <<"USAGE";
Usage:
  $cmd [options] /var/log/nginx/access.log*
Options:
  --exclude-local  Exclude local IPs and POST edit traffic
  --outdir DIR      Directory to write TSV outputs
  --period HH:MM    Period size (duration), default 01:00
  --server NAME    Only count hits where server_name == NAME (web-farm filter)
  --verbose        Echo processing information + report wanted agents from bots.conf
  --help            Show this help and exit
Output:
  One TSV per time bucket, named:
    YYYY_MM_DDThh_mm-to-YYYY_MM_DDThh_mm.tsv
Columns (fixed schema; total/server/path last; totals derivable):
  <agent>_<method>_<outcome> for every combination of
    agent:   curlwget ai bot human
    method:  get head post put other
    outcome: ok redir client_err other
  badbot_308
  total_hits server_name path
USAGE
    exit 0;
}
# Create the output directory when it does not exist yet.
-d $OUTDIR or make_path($OUTDIR);

# -------- period math (no validation, per instruction) --------
# --period is a duration written HH:MM; it is converted to seconds as given.
my ($PH, $PM) = split(/:/, $PERIOD, 2);
my $PERIOD_SECONDS = $PH * 3600 + $PM * 60;

# -------- edit exclusion window --------
# Inclusive window used by --exclude-local to drop POST+edit hits.
my $EDIT_TS_FORMAT = "%d/%b/%Y:%H:%M:%S";
my $START_EDIT = Time::Piece->strptime("12/Dec/2025:00:00:00", $EDIT_TS_FORMAT);
my $END_EDIT   = Time::Piece->strptime("01/Jan/2026:23:59:59", $EDIT_TS_FORMAT);

# -------- parse bots.conf (wanted patterns only) --------
my $BOTS_CONF = "/etc/nginx/bots.conf";
my @AI_REGEX;
my @BOT_REGEX;
my @AI_RAW;
my @BOT_RAW;

# Section header comments switch the parser mode; an empty mode means
# "ignore the lines that follow".
my @section_markers = (
    [ qr/^\s*#\s*good bots/i,     "GOOD" ],
    [ qr/^\s*#\s*AI bots/i,       "AI"   ],
    [ qr/^\s*#\s*unwanted bots/i, ""     ],
);

# For each active mode: where the raw pattern and its compiled form are stored.
my %sink_for = (
    AI   => [ \@AI_RAW,  \@AI_REGEX  ],
    GOOD => [ \@BOT_RAW, \@BOT_REGEX ],
);

open(my $bc, "<", $BOTS_CONF) or die "$cmd: cannot open $BOTS_CONF: $!";
my $mode = "";
CONF_LINE:
while (defined(my $conf_line = <$bc>)) {
    for my $marker (@section_markers) {
        my ($header_re, $section) = @$marker;
        if ($conf_line =~ $header_re) {
            $mode = $section;
            next CONF_LINE;
        }
    }
    next CONF_LINE if $mode eq "";

    # Only quoted case-insensitive patterns mapped to 0 are "wanted".
    my ($pat) = $conf_line =~ /~\*(.+?)"\s+0;/
        or next CONF_LINE;

    my ($raw_list, $regex_list) = @{ $sink_for{$mode} };
    push @$raw_list,   $pat;
    push @$regex_list, qr/$pat/i;
}
close $bc;

# Audit trail: list every wanted pattern when --verbose is set.
if ($VERBOSE) {
    print STDERR "[agents] good AI agent: ~*$_\n" for @AI_RAW;
    print STDERR "[agents] good bot: ~*$_\n"      for @BOT_RAW;
}
# -------- helpers --------
# Report whether an address is loopback or RFC 1918 private.
#
#   $ip  client address as it appears in the access log (string)
#
# Returns 1 for ::1, 127.0.0.0/8, 10.0.0.0/8, 172.16.0.0/12 and
# 192.168.0.0/16; returns 0 for anything else, including undef.
sub is_local_ip {
    my ($ip) = @_;
    return 0 unless defined $ip;
    return 1 if $ip eq "::1";
    return 1 if $ip =~ /^127\./;
    return 1 if $ip =~ /^10\./;
    return 1 if $ip =~ /^192\.168\./;
    # 172.16.0.0 - 172.31.255.255: second octet 16..31 only.
    return 1 if $ip =~ /^172\.(?:1[6-9]|2\d|3[01])\./;
    return 0;
}
# Classify one request into an agent category.
#
#   $status  HTTP status code from the log line
#   $ua      user-agent string (undef is treated as empty)
#
# Returns one of "badbot", "curlwget", "ai", "bot" or "human".
# Precedence is fixed: status 308 is definitive for badbot (no UA matching),
# then curl/wget, then the AI patterns, then the good-bot patterns from
# bots.conf; anything left over is counted as human.
sub agent_class {
    my ($status, $ua) = @_;
    return "badbot"
        if defined($status) && $status =~ /^\d+$/ && $status == 308;

    # Normalise once so every match below is safe on a missing user agent.
    $ua //= "";
    return "curlwget" if $ua =~ /\b(?:curl|wget)\b/i;
    for my $re (@AI_REGEX)  { return "ai"  if $ua =~ $re }
    for my $re (@BOT_REGEX) { return "bot" if $ua =~ $re }
    return "human";
}
# Map an HTTP method name to its column token.
#
#   $m  method exactly as logged (comparison is case-sensitive)
#
# Returns "head", "get", "post" or "put" for the four named methods and
# "other" for everything else.
sub method_bucket {
    my ($m) = @_;
    my %token_for = (
        HEAD => "head",
        GET  => "get",
        POST => "post",
        PUT  => "put",
    );
    return exists $token_for{$m} ? $token_for{$m} : "other";
}
# Map an HTTP status code to its outcome token.
#
#   $status  status code; undef or non-numeric input yields "other"
#
# Returns "ok" for 200 and 304, "redir" for the remaining 3xx codes,
# "client_err" for 4xx, and "other" for everything else.
sub status_bucket {
    my ($status) = @_;
    if (defined($status) && $status =~ /^\d+$/) {
        # 304 is tested before the 3xx range so it counts as a success.
        if ($status == 200 || $status == 304) {
            return "ok";
        }
        elsif ($status >= 300 && $status <= 399) {
            return "redir";    # 308 handled earlier as badbot
        }
        elsif ($status >= 400 && $status <= 499) {
            return "client_err";
        }
    }
    return "other";
}
# Reduce a request target to the canonical path used as an aggregation key.
#
#   $raw  request target from the log line (path or absolute URL);
#         undef is treated as empty
#
# Steps, in order:
#   1. strip scheme+host from absolute URLs;
#   2. drop the fragment;
#   3. split off the query string, rewriting */index.php?title=X to */X;
#   4. percent-decode exactly once;
#   5. re-encode control characters so the value stays a single TSV field;
#   6. collapse repeated slashes and trim a trailing slash (except for "/").
#
# Returns the canonical path; never returns an empty string.
sub normalise_path {
    my ($raw) = @_;
    my $p = defined($raw) ? $raw : "";

    $p =~ s{^https?://[^/]+}{}i;    # strip scheme+host if absolute URL
    $p =~ s/#.*$//s;                # fragments never identify a resource

    # Split once so we can canonicalise MediaWiki title= before dropping the query.
    my ($base, $qs) = split(/\?/, $p, 2);
    $base //= "";
    $qs   //= "";

    # Rewrite */index.php?title=X* => */X (preserve directory prefix).
    # The title is kept percent-encoded here: the single decode below must
    # treat it exactly like the same title arriving in path form, otherwise
    # the two URL forms of one page would not collate.
    if ($base =~ m{/index\.php$}i && $qs =~ /(?:^|&)title=([^&]+)/i) {
        my $title = $1;
        (my $prefix = $base) =~ s{/index\.php$}{}i;
        $base = $prefix . "/" . $title;
    }

    # Percent-decode ONCE (the query is dropped by using $base only).
    $p = uri_unescape($base);

    # A decoded tab or newline would break the TSV row; keep such bytes
    # percent-encoded.
    $p =~ s/([\x00-\x1f\x7f])/sprintf("%%%02X", ord($1))/ge;

    # Collapse multiple slashes
    $p =~ s{//+}{/}g;
    # Trim trailing slash except for root
    $p =~ s{/$}{} if length($p) > 1;

    return $p eq "" ? "/" : $p;
}
# Format a bucket boundary for use in an output file name.
#
#   $epoch  bucket boundary in seconds, as produced by the parse loop
#
# Returns "YYYY_MM_DDThh_mm".
#
# The parse loop strips the zone offset and parses the log timestamp with
# Time::Piece->strptime, so $epoch encodes the log's wall-clock time as if it
# were UTC. It therefore has to be broken down with gmtime; localtime would
# shift the file name by the host's zone offset.
sub fmt_ts {
    my ($epoch) = @_;
    my $tp = gmtime($epoch);
    return sprintf("%04d_%02d_%02dT%02d_%02d",
        $tp->year, $tp->mon, $tp->mday, $tp->hour, $tp->min);
}
# -------- log regex (captures server_name as final quoted field) --------
# Capture groups, in order: client address, bracketed timestamp, method,
# request target, status, user agent, server_name. The quoted field skipped
# just before the user agent is presumably the referer - confirm against the
# nginx log_format in use.
my $LOG_RE = qr{
    ^(\S+)\s+\S+\s+\S+\s+\[([^\]]+)\]\s+
    "(GET|POST|HEAD|[A-Z]+)\s+(\S+)[^"]*"\s+
    (\d+)\s+\d+.*?"[^"]*"\s+"([^"]*)"\s+"([^"]+)"\s*$
}x;
# -------- collect files (glob, then mtime ascending) --------
@ARGV or usage();
my @files;
{
    # Expand every argument, dropping repeats so overlapping globs cannot
    # count the same log file twice.
    my %seen;
    @files = grep { !$seen{$_}++ } map { glob($_) } @ARGV;

    # stat each file once instead of on every comparison. A file that cannot
    # be stat'ed sorts first and is reported when it is opened.
    my %mtime_of;
    for my $name (@files) {
        my $mtime = (stat($name))[9];
        $mtime_of{$name} = defined($mtime) ? $mtime : 0;
    }

    # Oldest first; equal mtimes fall back to name order so runs are repeatable.
    @files = sort { $mtime_of{$a} <=> $mtime_of{$b} or $a cmp $b } @files;
}
# -------- bucketed stats --------
# %BUCKETS{bucket_start}{end} = bucket_end
# %BUCKETS{bucket_start}{stats}{server}{page}{metric} = count
#
# Every input file is read line by line; lines that do not match $LOG_RE are
# ignored. Timestamps are parsed with the zone offset removed, so bucket
# boundaries follow the log's wall-clock time.
my %BUCKETS;
for my $file (@files) {
    print STDERR "$cmd: processing $file\n" if $VERBOSE;
    my $fh;
    if ($file =~ /\.gz$/) {
        $fh = IO::Uncompress::Gunzip->new($file)
            or die "$cmd: gunzip $file: $GunzipError";
    } else {
        open($fh, "<", $file) or die "$cmd: open $file: $!";
    }
    LINE:
    while (my $line = <$fh>) {
        next LINE unless $line =~ /$LOG_RE/;
        my ($ip, $ts, $method, $path, $status, $ua, $server_name)
            = ($1, $2, $3, $4, $5, $6, $7);
        next LINE if $SERVER ne "" && $server_name ne $SERVER;

        (my $clean = $ts) =~ s/\s+[+-]\d{4}$//;
        # strptime croaks on text it cannot parse; one damaged line must not
        # abort the whole rollup, so it is skipped instead.
        my $tp = eval { Time::Piece->strptime($clean, "%d/%b/%Y:%H:%M:%S") };
        if (!defined $tp) {
            print STDERR "$cmd: skipping line with unparseable timestamp [$ts] in $file\n"
                if $VERBOSE;
            next LINE;
        }
        my $epoch = $tp->epoch;

        # Exclusions are applied before bucketing so excluded hits are never counted.
        if ($EXCLUDE_LOCAL) {
            next LINE if is_local_ip($ip);
            if ($method eq "POST" && $path =~ /edit/i) {
                next LINE if $tp >= $START_EDIT && $tp <= $END_EDIT;
            }
        }

        my $bucket_start = int($epoch / $PERIOD_SECONDS) * $PERIOD_SECONDS;
        my $bucket_end   = $bucket_start + $PERIOD_SECONDS;
        my $npath  = normalise_path($path);
        my $aclass = agent_class($status, $ua);

        # badbot has a single counter; every other class is split by
        # method and outcome.
        my $metric = $aclass eq "badbot"
            ? "badbot_308"
            : join("_", $aclass, method_bucket($method), status_bucket($status));

        $BUCKETS{$bucket_start}{end} = $bucket_end;
        $BUCKETS{$bucket_start}{stats}{$server_name}{$npath}{$metric}++;
    }
    close $fh;
}
# -------- write outputs --------
# One TSV per bucket. Column order is fixed: every agent/method/outcome
# counter, then badbot_308, then total_hits, server_name and path.
my @ACTORS  = qw(curlwget ai bot human);
my @METHODS = qw(get head post put other);
my @SB      = qw(ok redir client_err other);

# Numeric counter columns only; total_hits is the sum of exactly these.
my @COUNTER_COLS;
for my $actor (@ACTORS) {
    for my $method (@METHODS) {
        for my $outcome (@SB) {
            push @COUNTER_COLS, join("_", $actor, $method, $outcome);
        }
    }
}
push @COUNTER_COLS, "badbot_308";

my @COLS = (@COUNTER_COLS, "total_hits", "server_name", "path");

for my $bstart (sort { $a <=> $b } keys %BUCKETS) {
    my $bend = $BUCKETS{$bstart}{end};
    my $out = File::Spec->catfile(
        $OUTDIR,
        fmt_ts($bstart) . "-to-" . fmt_ts($bend) . ".tsv"
    );
    print STDERR "$cmd: writing $out\n" if $VERBOSE;
    open my $outf, ">", $out or die "$cmd: write $out: $!";
    print {$outf} join("\t", @COLS), "\n";

    my $stats = $BUCKETS{$bstart}{stats};
    for my $srv (sort keys %$stats) {
        my $counts_for = $stats->{$srv};

        # Sum each path once rather than inside the sort comparator.
        my %total_for;
        for my $p (keys %$counts_for) {
            my $sum = 0;
            $sum += ($counts_for->{$p}{$_} // 0) for @COUNTER_COLS;
            $total_for{$p} = $sum;
        }

        # Busiest path first; equal totals fall back to path order so that
        # identical input always yields identical output.
        my @ordered_paths =
            sort { $total_for{$b} <=> $total_for{$a} or $a cmp $b }
            keys %$counts_for;

        for my $p (@ordered_paths) {
            my @counts = map { $counts_for->{$p}{$_} // 0 } @COUNTER_COLS;
            print {$outf} join("\t", @counts, $total_for{$p}, $srv, $p), "\n";
        }
    }

    # Buffered write errors only surface at close.
    close $outf or die "$cmd: close $out: $!";
}
</pre>
=categories=

Revision as of 12:50, 30 January 2026

Publications access graphs

  • 2026-01-30 from 2025-12-25: accumulated human get

  • 2026-01-30 from 2025-12-25: page access scatter plot

Corpus Projection Invariants (Normative)

There are two main projections:

  • accumulated human_get time series
  • page_category scatter plot

There is one set of invariants for title normalisation and corpus membership. For the two projections:

  • Corpus membership constraints apply ONLY to the accumulated human_get time series.
  • The daily hits scatter is NOT restricted to corpus titles.
  • Main_Page is excluded from the accumulated human_get projection, but not the scatter plot - this is intentional

Authority and Governance

  • The projections are curator-governed and MUST be reproducible from declared inputs alone.
  • The assisting system MUST NOT infer, rename, paraphrase, merge, split, or reorder titles beyond the explicit rules stated here.
  • The assisting system MUST NOT optimise for visual clarity at the expense of semantic correctness.
  • Any deviation from these invariants MUST be explicitly declared by the human curator with a dated update entry.

Authoritative Inputs

  • Input A: Hourly rollup TSVs produced by logrollup tooling.
  • Input B: Corpus bundle manifest (corpus/manifest.tsv).
  • Input C: Host scope fixed to publications.arising.com.au.
  • Input D: Full temporal range present in the rollup set (no truncation).

Path → Title Extraction

  • A rollup record contributes to a page only if a title can be extracted by these rules:
    • If path matches /pub/<title>, then <title> is the candidate.
    • If path matches /pub-dir/index.php?<query>, the title MUST be taken from title=<title>.
    • If title= is absent, page=<title> MAY be used.
    • Otherwise, the record MUST NOT be treated as a page hit.
  • URL fragments (#…) MUST be removed prior to extraction.

Title Normalisation

  • URL decoding MUST occur before all other steps.
  • Underscores (_) MUST be converted to spaces.
  • UTF-8 dashes (–, —) MUST be converted to ASCII hyphen (-).
  • Whitespace runs MUST be collapsed to a single space and trimmed.
  • After normalisation, the title MUST exactly match a manifest title to remain eligible.
  • Main Page MUST be excluded from this projection.

Wiki Farm Canonicalisation (Mandatory)

  • Each MediaWiki instance in a farm is identified by a (vhost, root) pair.
  • Each instance exposes paired URL forms:
    • /<x>/<Title>
    • /<x-dir>/index.php?title=<Title>
  • For the bound vhost:
    • /<x>/ and /<x-dir>/index.php MUST be treated as equivalent roots.
    • All page hits MUST be folded to a single canonical resource per title.
  • Canonical resource key:
    • (vhost, canonical_title)

Resource Extraction Order (Mandatory)

  1. URL-decode the request path.
  2. Extract title candidate:
    1. If path matches ^/<x>/<title>, extract <title>.
    2. If path matches ^/<x-dir>/index.php?<query>:
      1. Use title=<title> if present.
      2. Else MAY use page=<title> if present.
    3. Otherwise the record is NOT a page resource.
  3. Canonicalise title:
    1. "_" → space
    2. UTF-8 dashes (–, —) → "-"
    3. Collapse whitespace
    4. Trim leading/trailing space
  4. Apply namespace exclusions.
  5. Apply infrastructure exclusions.
  6. Apply canonical folding.
  7. Aggregate.

Infrastructure Exclusions (Mandatory)

Exclude:

  • /
  • /robots.txt
  • Any path containing "sitemap"
  • Any path containing /resources or /resources/
  • /<x-dir>/index.php
  • /<x-dir>/load.php
  • /<x-dir>/api.php
  • /<x-dir>/rest.php/v1/search/title

Exclude static resources by extension:

  • .png .jpg .jpeg .gif .svg .ico .webp


Accumulated human_get time series (projection)

Eligible Resource Set (Corpus Titles)

  • The eligible title set MUST be derived exclusively from corpus/manifest.tsv.
  • Column 1 of manifest.tsv is the authoritative MediaWiki page title.
  • Only titles present in the manifest (after normalisation) are eligible for projection.
  • Titles present in the manifest MUST be included in the projection domain even if they receive zero hits in the period.
  • Titles not present in the manifest MUST be excluded even if traffic exists.

Noise and Infrastructure Exclusions

  • The following MUST be excluded prior to aggregation:
    • Special:, Category:, Category talk:, Talk:, User:, User talk:, File:, Template:, Help:, MediaWiki:
    • /resources/, /pub-dir/load.php, /pub-dir/api.php, /pub-dir/rest.php
    • /robots.txt, /favicon.ico
    • sitemap (any case)
    • Static resources by extension (.png, .jpg, .jpeg, .gif, .svg, .ico, .webp)

Temporal Aggregation

  • Hourly buckets MUST be aggregated into daily totals per title.
  • Accumulated value per title is defined as:
    • cum_hits(title, day_n) = Σ daily_hits(title, day_0 … day_n)
  • Accumulation MUST be monotonic and non-decreasing.

Axis and Scale Invariants

  • X axis: calendar date from earliest to latest available day.
  • Major ticks every 7 days.
  • Minor ticks every day.
  • Date labels MUST be rotated (oblique) for readability.
  • Y axis MUST be logarithmic.
  • Zero or negative values MUST NOT be plotted on the log axis.

Legend Ordering

  • Legend entries MUST be ordered by descending final accumulated human_get_ok.
  • Ordering MUST be deterministic and reproducible.

Visual Disambiguation Invariants

  • Each title MUST be visually distinguishable.
  • The same colour MAY be reused.
  • The same line style MAY be reused.
  • The same (colour + line style) pair MUST NOT be reused.
  • Markers MAY be omitted or reused but MUST NOT be relied upon as the sole distinguishing feature.

Rendering Constraints

  • Legend MUST be placed outside the plot area on the right.
  • Sufficient vertical and horizontal space MUST be reserved to avoid label overlap.
  • Line width SHOULD be consistent across series to avoid implied importance.

Interpretive Constraint

  • This projection indicates reader entry and navigation behaviour only.
  • High lead-in ranking MUST NOT be interpreted as quality, authority, or endorsement.
  • Ordering reflects accumulated human access, not epistemic priority.

Periodic Regeneration

  • This projection is intended to be regenerated periodically.
  • Cross-run comparisons MUST preserve all invariants to allow valid temporal comparison.
  • Changes in lead-in dominance (e.g. Plain-Language Summary vs. CM-1 foundation paper) are observational signals only and do not alter corpus structure.

Metric Definition

  • The only signal used is human_get_ok.
  • Non-human classifications MUST NOT be included.
  • No inference from other status codes or agents is permitted.

Corpus Lead-In Projection: Deterministic Colour Map

This table provides the visual encoding for the core corpus pages. For titles not included in the colour map, use colours at your discretion until a Colour Map entry exists.

Colours are drawn from the Matplotlib tab20 palette.

Line styles are assigned to ensure that no (colour + line-style) pair is reused. Legend ordering is governed separately by accumulated human GET_ok.

Corpus Page Title Colour Index Colour (hex) Line Style
Authority Inversion: A Structural Failure in Human-AI Systems 0 #1f77b4 -
Axes of Authority in Stateless Cognitive Systems: Authority Is Not Intelligence 1 #aec7e8 -
CM Capability survey invariants 2 #ff7f0e -
CM-master-1.16 (anchored) 3 #ffbb78 -
Case Study - When the Human Has to Argue With the Machine 4 #2ca02c -
ChatGPT UI Boundary Friction as a Constraint on Round-Trip Knowledge Engineering 5 #98df8a -
Cognitive Memoisation (CM) Public Statement and Stewardship Model 6 #d62728 -
Cognitive Memoisation (CM-2) for Governing Knowledge in Human-AI Collaboration 7 #ff9896 -
Cognitive Memoisation Corpus Map 8 #9467bd -
Cognitive Memoisation Is Not Skynet 9 #c5b0d5 -
Cognitive Memoisation and LLMs: A Method for Exploratory Modelling Before Formalisation 10 #8c564b -
Cognitive Memoisation: LLM Systems Requirements for Knowledge Round Trip Engineering 11 #c49c94 -
Cognitive Memoisation: Plain-Language Summary (For Non-Technical Readers) 12 #e377c2 -
Context is Not Just a Window: Cognitive Memoisation as a Context Architecture for Human-AI Collaboration 13 #f7b6d2 -
Dangling Cognates: Preserving Unresolved Knowledge in Cognitive Memoisation 14 #7f7f7f -
Delegation of Authority to AI Systems: Evidence and Risks 15 #c7c7c7 -
Dimensions of Platform Error: Epistemic Retention Failure in Conversational AI Systems 16 #bcbd22 -
Durability Without Authority: The Missing Governance Layer in Human-AI Collaboration 17 #dbdb8d -
Episodic Failure Case Study: Tied-in-a-Knot Chess Game 18 #17becf -
Externalised Meaning: Making Knowledge Portable Without Ontologies, Vendors or Memory 19 #9edae5 -
First Self-Hosting Epistemic Capture Using Cognitive Memoisation (CM-2) 0 #1f77b4 --
From UI Failure to Logical Entrapment: A Case Study in Post-Hoc Cognitive Memoisation After Exploratory Session Breakdown 1 #aec7e8 --
Governance Failure Axes Taxonomy 2 #ff7f0e --
Governing the Tool That Governs You: A CM-1 Case Study of Authority Inversion in Human-AI Systems 3 #ffbb78 --
Identified Governance Failure Axes: for LLM platforms 4 #2ca02c --
Integrity and Semantic Drift in Large Language Model Systems 5 #98df8a --
Journey: Human-Led Convergence in the Articulation of Cognitive Memoisation 6 #d62728 --
Looping the Loop with No End in Sight: Circular Reasoning Under Stateless Inference Without Governance 7 #ff9896 --
Market Survey: Portability of CM Semantics Across LLM Platforms 8 #9467bd --
Nothing Is Lost: How to Work with AI Without Losing Your Mind 9 #c5b0d5 --
Observed Model Stability: Evidence for Drift-Immune Embedded Governance 10 #8c564b --
Post-Hoc CM Recovery Collapse Under UI Boundary Friction: A Negative Result Case Study 11 #c49c94 --
Progress Without Memory: Cognitive Memoisation as a Knowledge-Engineering Pattern for Stateless LLM Interaction 12 #e377c2 --
Reflexive Development of Cognitive Memoisation: A Round-Trip Cognitive Engineering Case Study 13 #f7b6d2 --
Reflexive Development of Cognitive Memoisation: Dangling Cognates as a First-Class Cognitive Construct 14 #7f7f7f --
What Can Humans Trust LLM AI to Do? 15 #c7c7c7 --
When Evidence Is Not Enough: An Empirical Study of Authority Inversion and Integrity Failure in Conversational AI 16 #bcbd22 --
When Training Overrides Logic: Why Declared Invariants Were Not Enough 17 #dbdb8d --
Why Cognitive Memoisation Is Not Memorization 18 #17becf --
Why Machines Cannot Own Knowledge 19 #9edae5 --
XDUMP as a Minimal Recovery Mechanism for Round-Trip Knowledge Engineering Under Governance Situated Inference Loss 0 #1f77b4 -.

Corpus Lead-In Projection: Colour-Map Hardening Invariants

This section hardens the visual determinism of the Corpus Lead-In Projection while allowing controlled corpus growth.

Authority

  • This Colour Map is **authoritative** for all listed corpus pages.
  • The assisting system MUST NOT invent, alter, or substitute colours or line styles for listed pages.
  • Visual encoding is a governed property, not a presentation choice.

Binding Rule

  • For any page listed in the Deterministic Colour Map table:
    • The assigned (colour index, colour hex, line style) pair MUST be used exactly.
    • Deviation constitutes a projection violation.

Legend Ordering Separation

  • Colour assignment and legend ordering are orthogonal.
  • Legend ordering MUST continue to follow the accumulated human GET_ok invariant.
  • Colour assignment MUST NOT be influenced by hit counts, rank, or ordering.

New Page Admission Rule

  • Pages not present in the current Colour Map MUST appear in a projection.
  • New pages MUST be assigned styles in strict sequence order:
    • Iterate line style first, then colour index, exactly as defined in the base palette.
    • Previously assigned pairs MUST NOT be reused.
  • The assisting system MUST NOT reshuffle existing assignments to “make space”.

Provisional Encoding Rule

  • Visual assignments for newly admitted pages are **provisional** until recorded.
  • A projection that introduces provisional encodings MUST:
    • Emit a warning note in the run metadata, and
    • Produce an updated Colour Map table for curator review.

Curator Ratification

  • Only the human curator may ratify new colour assignments.
  • Ratification occurs by appending new rows to the Colour Map table with a date stamp.
  • Once ratified, assignments become binding for all future projections.

Backward Compatibility

  • Previously generated projections remain valid historical artefacts.
  • Introduction of new pages MUST NOT retroactively alter the appearance of older projections.

Failure Mode Detection

  • If a projection requires more unique (colour, line-style) pairs than the declared palette provides:
    • The assisting system MUST fail explicitly.
    • Silent reuse, substitution, or visual approximation is prohibited.

Rationale (Non-Normative)

  • This hardening ensures:
    • Cross-run visual comparability
    • Human recognition of lead-in stability
    • Detectable drift when corpus structure changes
  • Visual determinism is treated as part of epistemic governance, not aesthetics.

daily hits scatter (projections)

purpose:

  • Produce deterministic, human-meaningful MediaWiki page-level analytics from nginx bucket TSVs,
  • folding all URL variants to canonical resources and rendering a scatter projection across agents, HTTP methods, and outcomes.

Authority

  • These invariants are normative.
  • The assisting system MUST follow them exactly.
  • Visual encoding is a governed semantic property, not a presentation choice.

Inputs

  • Bucket TSVs produced by page_hits_bucketfarm_methods.pl
  • Required columns:
    • server_name
    • path (or page_category)
    • <agent>_<METHOD>_<outcome> numeric bins
  • Other columns MAY exist and MUST be ignored unless explicitly referenced.

Scope

  • Projection MUST bind to exactly ONE nginx virtual host at a time.
  • Example: publications.arising.com.au
  • Cross-vhost aggregation is prohibited.

Namespace Exclusions (Mandatory)

Exclude titles with case-insensitive prefix:

  • Special:
  • Category:
  • Category talk:
  • Talk:
  • User:
  • User talk:
  • File:
  • Template:
  • Help:
  • MediaWiki:
  • Obvious misspellings (e.g. Catgeory:) SHOULD be excluded.

Bad Bot Hits

  • Bad bot hits MUST be included for any canonical page resource that survives normalisation and exclusion.
  • Bad bot traffic MUST NOT be excluded solely by agent class; it MAY only be excluded if the request is excluded by namespace or infrastructure rules.
  • badbot_308 SHALL be treated as badbot_GET_redir for scatter projections.
  • Human success ordering (HUMAN_200_304) remains the sole ordering metric; inclusion of badbot hits MUST NOT affect ranking.

Aggregation Invariant (Mandatory)

  • Aggregate across ALL rollup buckets in the selected time span.
  • GROUP BY canonical resource.
  • SUM all numeric <agent>_<METHOD>_<outcome> bins.
  • Each canonical resource MUST appear exactly once.

Human Success Spine (Mandatory)

  • Define ordering metric:
    • HUMAN_200_304 := human_GET_ok + human_GET_redir
  • This metric is used ONLY for vertical ordering.

Ranking and Selection

  • Sort resources by HUMAN_200_304 descending.
  • Select Top-N resources (default N = 50).
  • Any non-default N MUST be declared in run metadata.

Rendering Invariants (Scatter Plot)

Axes

  • X axis MUST be logarithmic.
  • X axis MUST include log-paper verticals:
    • Major: 10^k
    • Minor: 2..9 × 10^k
  • Y axis lists canonical resources ordered by HUMAN_200_304.

Baseline Alignment

  • The resource label baseline MUST align with human GET_ok.
  • human_GET_ok points MUST have vertical offset = 0.
  • Draw faint horizontal baseline guides for each resource row.

Category Plotting

  • ALL agent/method/outcome bins present MUST be plotted.
  • No category elision, suppression, or collapsing is permitted.

Intra-Row Separation

  • Apply deterministic vertical offsets per METHOD_outcome key.
  • Offsets MUST be stable and deterministic.
  • human_GET_ok is exempt (offset = 0).

Redirect Jitter

  • human_GET_redir MUST receive a small fixed positive offset (e.g. +0.35).
  • Random jitter is prohibited.

Encoding Invariants

Agent Encoding

  • Agent encoded by colour.
  • badbot MUST be red.

Method Encoding

  • GET → o
  • POST → ^
  • PUT → v
  • HEAD → D
  • OTHER → .

Outcome Overlay

  • ok → no overlay
  • redir → diagonal slash (/)
  • client_err → x
  • server_err → x
  • other/unknown → +

Legend Invariants

  • Legend MUST be present.
  • Legend title MUST be exactly: Legend
  • Legend MUST explain:
    • Agent colours
    • Method shapes
    • Outcome overlays
  • Legend MUST NOT overlap resource labels.
  • The legend MUST be labelled 'Legend' only.

Legend Presence (Mandatory)

  • A legend MUST be rendered on every scatter plot output.
  • The legend title MUST be exactly: Legend
  • A projection without a legend is non-compliant.

Legend Content (Mandatory; Faithful to Encoding Invariants)

The legend MUST include three components:

  1. Agent key (colour):
    1. human
    2. ai
    3. bot
    4. curlwget
    5. badbot (MUST be red)
  2. Method key (base shapes):
    1. GET → o
    2. POST → ^
    3. PUT → v
    4. HEAD → D
    5. OTHER → .
  3. Outcome overlay key:
    1. x = error (client_err or server_err)
    2. / = redir
    3. + = other (other or unknown)
    4. none = ok

Legend Placement (Mandatory)

  • The legend MUST be placed INSIDE the plotting area.
  • The legend location MUST be bottom-right (axis-anchored):
    • loc = lower right
  • The legend MUST NOT be placed outside the plot area (no RHS external legend).
  • The legend MUST NOT overlap the y-axis labels (resource labels).
  • The legend MUST be fully visible and non-clipped in the output image.

Legend Rendering Constraints (Mandatory)

  • The legend MUST use a frame (boxed) to preserve readability over gridlines/points.
  • The legend frame SHOULD use partial opacity to avoid obscuring data:
    • frame alpha SHOULD be approximately 0.85 (fixed, deterministic).
  • Legend ordering MUST be deterministic (fixed order):
    • Agents: human, ai, bot, curlwget, badbot
    • Methods: GET, POST, PUT, HEAD, OTHER
    • Outcomes: x=error, /=redir, +=other, none=ok

Validation

A compliant scatter output SHALL satisfy:

  • Legend is present.
  • Legend title equals "Legend".
  • Legend is inside plot bottom-right.
  • Legend is non-clipped.
  • Legend contains agent, method, and outcome keys as specified.

Determinism

  • No random jitter.
  • No data-dependent styling.
  • Identical inputs MUST produce identical outputs.

Validation Requirements

  • No duplicate logical pages after canonical folding.
  • HUMAN_200_304 ordering is monotonic.
  • All plotted points trace back to bucket TSV bins.
  • /<x> and /<x-dir> variants MUST fold to the same canonical resource.

Rollup code

root@padme:/home/ralph/AI# cat logrollup
#!/usr/bin/env perl
use strict;
use warnings;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Time::Piece;
use Getopt::Long;
use File::Path qw(make_path);
use File::Spec;
use URI::Escape qw(uri_unescape);

#BEGIN_MWDUMP
#title: CM-bucket-rollup invariants
#format: MWDUMP
#invariants:
#  1. server_name is first-class; never dropped; emitted in output schema and used for optional filtering.
#  2. input globs are expanded then processed in ascending mtime order (oldest -> newest).
#  3. time bucketing is purely mathematical: bucket_start = floor(epoch/period_seconds)*period_seconds.
#  4. badbot is definitive and detected ONLY by HTTP status == 308; no UA regex for badbot.
#  5. AI and bot are derived from /etc/nginx/bots.conf:
#     - only patterns mapping to 0 are "wanted"
#     - between '# good bots' and '# AI bots' => bot
#     - between '# AI bots' and '# unwanted bots' => AI_bot
#     - unwanted-bots section ignored for analytics classification
#  6. output TSV schema is fixed (total/host/path last; totals are derivable):
#       curlwget|ai|bot|human × (get|head|post|put|other) × (ok|redir|client_err|other)
#       badbot_308
#       total_hits server_name path
#  7. Path identity is normalised so the same resource collates across:
#       absolute URLs, query strings (incl action/edit), MediaWiki title=, percent-encoding, and trailing slashes.
#  8. --exclude-local excludes (does not count) local IP hits and POST+edit hits in the defined window, before bucketing.
#  9. web-farm safe: aggregation keys include bucket_start + server_name + path; no cross-vhost contamination.
# 10. bots.conf parsing must be auditable: when --verbose, report "good AI agent" and "good bot" patterns to STDERR.
# 11. method taxonomy is uniform for all agent categories: GET, HEAD, POST, PUT, OTHER (everything else).
#END_MWDUMP

# Program name as invoked; interpolated into the usage text and diagnostics.
my $cmd = $0;

# -------- options --------
# Defaults: all flags off, output to the current directory, one-hour
# buckets, and no server_name filter.
my $EXCLUDE_LOCAL = 0;
my $VERBOSE       = 0;
my $HELP          = 0;
my $OUTDIR        = ".";
my $PERIOD        = "01:00";
my $SERVER        = "";        # optional filter; empty means all

# Getopt::Long specification: "!" marks negatable flags, "=s" options
# that require a string value.
my %option_spec = (
    "exclude-local!" => \$EXCLUDE_LOCAL,
    "verbose!"       => \$VERBOSE,
    "help!"          => \$HELP,
    "outdir=s"       => \$OUTDIR,
    "period=s"       => \$PERIOD,
    "server=s"       => \$SERVER,
);

# Both an unparsable command line and an explicit --help end in usage().
my $parsed_ok = GetOptions(%option_spec);
usage() unless $parsed_ok;
usage() if $HELP;

# Print the command-line help to STDOUT and terminate the program.
#
# Takes no arguments and never returns: it always exits with status 0.
# The "Columns" section mirrors the header row that the TSV writer emits
# (agent x method x status counters, then badbot_308, then
# total_hits/server_name/path); keep the two in step.
sub usage {
    print <<"USAGE";
Usage:
  $cmd [options] /var/log/nginx/access.log*

Options:
  --exclude-local   Exclude local IPs and POST edit traffic
  --outdir DIR      Directory to write TSV outputs
  --period HH:MM    Period size (duration), default 01:00
  --server NAME     Only count hits where server_name == NAME (web-farm filter)
  --verbose         Echo processing information + report wanted agents from bots.conf
  --help            Show this help and exit

Output:
  One TSV per time bucket, named:
    YYYY_MM_DDThh_mm-to-YYYY_MM_DDThh_mm.tsv

Columns (total/server/path last; totals derivable):
  <agent>_<method>_<status> for every combination of
    agent  = curlwget | ai | bot | human
    method = get | head | post | put | other
    status = ok | redir | client_err | other
  badbot_308
  total_hits server_name path
USAGE
    exit 0;
}

# Create the output directory (including missing parents) if it is absent.
-d $OUTDIR or make_path($OUTDIR);

# -------- period math (no validation, per instruction) --------
# --period is a duration written HH:MM; convert it to seconds.
my ($PH, $PM) = split(/:/, $PERIOD, 2);
my $PERIOD_SECONDS = (3600 * $PH) + (60 * $PM);

# -------- edit exclusion window --------
# Inclusive bounds, written in the same timestamp layout the log lines use.
my ($START_EDIT, $END_EDIT) = map {
    Time::Piece->strptime($_, "%d/%b/%Y:%H:%M:%S")
} ("12/Dec/2025:00:00:00", "01/Jan/2026:23:59:59");

# -------- parse bots.conf (wanted patterns only) --------
# Two parallel lists per category: the raw pattern text (for the verbose
# audit report) and the compiled case-insensitive regex (for matching).
my $BOTS_CONF = "/etc/nginx/bots.conf";
my (@AI_REGEX, @BOT_REGEX);
my (@AI_RAW, @BOT_RAW);

open my $bc, "<", $BOTS_CONF or die "$cmd: cannot open $BOTS_CONF: $!";
my $mode = "";
while (my $conf_line = <$bc>) {
    # Section-marker comments switch the classification mode; the marker
    # line itself never contributes a pattern.
    if ($conf_line =~ /^\s*#\s*good bots/i) {
        $mode = "GOOD";
    }
    elsif ($conf_line =~ /^\s*#\s*AI bots/i) {
        $mode = "AI";
    }
    elsif ($conf_line =~ /^\s*#\s*unwanted bots/i) {
        $mode = "";
    }
    # Inside a wanted section, keep only quoted ~* patterns mapped to 0.
    elsif ($mode && $conf_line =~ /~\*(.+?)"\s+0;/) {
        my $pat = $1;

        if ($mode eq "GOOD") {
            push @BOT_RAW,  $pat;
            push @BOT_REGEX, qr/$pat/i;
        }
        elsif ($mode eq "AI") {
            push @AI_RAW,  $pat;
            push @AI_REGEX, qr/$pat/i;
        }
    }
}
close $bc;

# Audit trail: list every wanted pattern, AI agents first, then bots.
if ($VERBOSE) {
    foreach my $p (@AI_RAW) {
        print STDERR "[agents] good AI agent: ~*$p\n";
    }
    foreach my $p (@BOT_RAW) {
        print STDERR "[agents] good bot: ~*$p\n";
    }
}

# -------- helpers --------
# Decide whether a client address is "local" for --exclude-local purposes.
#
# Parameters:
#   $ip - client address text as it appears in the log line.
# Returns:
#   1 for IPv4/IPv6 loopback (127.0.0.0/8, ::1) and for the three RFC 1918
#   private ranges (10/8, 172.16/12, 192.168/16); 0 for anything else,
#   including an undefined value.
sub is_local_ip {
    my ($ip) = @_;
    return 0 unless defined $ip;

    return 1 if $ip eq "::1";
    return 1 if $ip =~ /^127\./;                         # whole loopback /8
    return 1 if $ip =~ /^10\./;
    return 1 if $ip =~ /^172\.(?:1[6-9]|2\d|3[01])\./;   # 172.16.0.0 - 172.31.255.255
    return 1 if $ip =~ /^192\.168\./;
    return 0;
}

# Classify one hit by agent category.
#
# Parameters:
#   $status - HTTP status code of the response.
#   $ua     - User-Agent text; undef is treated as an empty string.
# Returns one of "badbot", "curlwget", "ai", "bot" or "human".
#
# Precedence is significant: status 308 marks a bad bot regardless of the
# User-Agent; curl/wget is tested before the bots.conf-derived AI list,
# which is tested before the good-bot list; anything unmatched is human.
sub agent_class {
    my ($status, $ua) = @_;
    return "badbot" if defined($status) && $status == 308;

    # Normalise once so every match below sees a defined string.
    $ua //= "";

    return "curlwget" if $ua =~ /\b(?:curl|wget)\b/i;
    for my $re (@AI_REGEX)  { return "ai"  if $ua =~ $re }
    for my $re (@BOT_REGEX) { return "bot" if $ua =~ $re }
    return "human";
}

# Map an HTTP method name onto the fixed method taxonomy used in metric
# names.  Matching is exact and case-sensitive; any method other than
# HEAD, GET, POST or PUT is reported as "other".
sub method_bucket {
    my ($m) = @_;

    my %bucket_for = (
        HEAD => "head",
        GET  => "get",
        POST => "post",
        PUT  => "put",
    );

    return exists $bucket_for{$m} ? $bucket_for{$m} : "other";
}

# Collapse an HTTP status code into one of four outcome buckets:
#   "ok"         - exactly 200 or 304
#   "redir"      - any other 3xx
#   "client_err" - 4xx
#   "other"      - everything else, including undefined or non-numeric input
sub status_bucket {
    my ($status) = @_;

    # Only a plain run of digits can be classified.
    if (!defined($status) || $status !~ /^\d+$/) {
        return "other";
    }

    my $is_ok = ($status == 200 || $status == 304);

    # 304 is claimed by "ok" before the 3xx range is consulted;
    # 308 handled earlier as badbot.
    return $is_ok                               ? "ok"
         : ($status >= 300 && $status <= 399)   ? "redir"
         : ($status >= 400 && $status <= 499)   ? "client_err"
         :                                        "other";
}

# Reduce a request target to the canonical path used as an aggregation key.
#
# Parameters:
#   $raw - request target from the log line (origin-form path or absolute
#          URL); undef or empty is treated as "/".
# Returns:
#   The canonical path: scheme+host removed, query and fragment dropped,
#   "<prefix>/index.php?title=X" rewritten to "<prefix>/X", percent-decoded
#   exactly once, runs of slashes collapsed, trailing slash trimmed.  The
#   result is never empty ("/" is returned instead) and never contains raw
#   control characters.
sub normalise_path {
    my ($raw) = @_;

    # Guard before the first regex touches the value.
    my $p = $raw // "";
    $p =~ s{^https?://[^/]+}{}i;          # strip scheme+host if absolute URL
    $p = "/" if $p eq "";

    # Split once so we can canonicalise MediaWiki title= before dropping the query.
    my ($base, $qs) = split(/\?/, $p, 2);
    $base //= "";
    $qs   //= "";

    # Rewrite */index.php?title=X* => */X (preserve directory prefix).
    # The title is spliced in still percent-encoded so that it goes through
    # the same single decode as a direct /X request; decoding it here as
    # well would decode it twice (title=A%2541 would become "AA") and would
    # let an encoded "#" (%23) be mistaken for a fragment below.
    if ($base =~ m{/index\.php$}i && $qs =~ /(?:^|&)title=([^&]+)/i) {
        my $title = $1;
        (my $prefix = $base) =~ s{/index\.php$}{}i;
        $base = $prefix . "/" . $title;
    }

    # Drop query/fragment entirely (normalise out action=edit etc.)
    $p = $base;
    $p =~ s/#.*$//;

    # Percent-decode ONCE
    $p = uri_unescape($p);

    # Decoding can produce tab/newline/other control bytes, which would
    # corrupt a tab-separated output row; keep those percent-encoded.
    $p =~ s/([\x00-\x1f\x7f])/sprintf("%%%02X", ord($1))/ge;

    # Collapse multiple slashes
    $p =~ s{//+}{/}g;

    # Trim trailing slash except for root
    $p =~ s{/$}{} if length($p) > 1;

    # A target such as "http://host?x=1" leaves nothing behind; that is root.
    $p = "/" if $p eq "";

    return $p;
}

# Format a bucket boundary for use in an output file name.
#
# Parameters:
#   $epoch - seconds value as produced by the log reader, i.e. the ->epoch
#            of a Time::Piece->strptime() parse of the log timestamp with
#            its zone offset removed.
# Returns:
#   "YYYY_MM_DDThh_mm".
#
# strptime() called as a class method yields a UTC-based object, so these
# epochs encode the log's wall-clock fields as if they were UTC.  They must
# therefore be broken down with gmtime: localtime would shift every file
# name by the host's UTC offset.
sub fmt_ts {
    my ($epoch) = @_;
    my $tp = gmtime($epoch);
    return sprintf("%04d_%02d_%02dT%02d_%02d",
        $tp->year, $tp->mon, $tp->mday, $tp->hour, $tp->min);
}

# -------- log regex (captures server_name as final quoted field) --------
# Matches one access-log line; lines that do not match are skipped by the
# reader loop.  Capture groups, in the order the reader unpacks them:
#   1  first whitespace-delimited token (used as the client IP)
#   2  text inside the [...] timestamp brackets
#   3  request method (any run of uppercase letters)
#   4  request target; the rest of the quoted request line is ignored
#   5  status code digits (the digits after it, the size field, are not kept)
#   6  second-to-last quoted field (used as the User-Agent; may be empty)
#   7  last quoted field (used as server_name; must be non-empty)
# One further quoted field before the User-Agent is matched but not kept.
# NOTE(review): this presumes an nginx log_format that appends a quoted
# server name after the user agent -- confirm against the nginx config.
my $LOG_RE = qr{
    ^(\S+)\s+\S+\s+\S+\s+\[([^\]]+)\]\s+
    "(GET|POST|HEAD|[A-Z]+)\s+(\S+)[^"]*"\s+
    (\d+)\s+\d+.*?"[^"]*"\s+"([^"]*)"\s+"([^"]+)"\s*$
}x;

# -------- collect files (glob, then mtime ascending) --------
# Every command-line argument is expanded as a glob pattern, then the
# combined list is ordered oldest -> newest by modification time.
@ARGV or usage();

my @files = map { glob($_) } @ARGV;

# stat() each file once and sort on the cached mtime instead of calling
# stat() inside the comparator.  A file that cannot be stat()ed sorts as
# mtime 0 (first) and is reported when the reader fails to open it.
@files = map  { $_->[1] }
         sort { $a->[0] <=> $b->[0] }
         map  { [ ((stat($_))[9] // 0), $_ ] } @files;

# -------- bucketed stats --------
# %BUCKETS{bucket_start}{end} = bucket_end
# %BUCKETS{bucket_start}{stats}{server}{page}{metric} = count
my %BUCKETS;

foreach my $file (@files) {
    $VERBOSE and print STDERR "$cmd: processing $file\n";

    # Names ending in .gz are read through a streaming decompressor;
    # everything else is opened as a plain file.
    my $fh;
    if ($file !~ /\.gz$/) {
        open($fh, "<", $file) or die "$cmd: open $file: $!";
    }
    else {
        $fh = IO::Uncompress::Gunzip->new($file)
            or die "$cmd: gunzip $file: $GunzipError";
    }

    LINE:
    while (my $line = <$fh>) {
        # Unparseable lines are skipped silently.
        my ($ip, $ts, $method, $path, $status, $ua, $server_name)
            = $line =~ $LOG_RE
            or next LINE;

        # Optional web-farm filter.
        next LINE if $SERVER ne "" && $server_name ne $SERVER;

        # Drop the trailing zone offset, then parse the wall-clock fields.
        (my $clean = $ts) =~ s/\s+[+-]\d{4}$//;
        my $tp    = Time::Piece->strptime($clean, "%d/%b/%Y:%H:%M:%S");
        my $epoch = $tp->epoch;

        # Exclusions are applied before the hit reaches any bucket.
        if ($EXCLUDE_LOCAL) {
            next LINE if is_local_ip($ip);
            next LINE
                if $method eq "POST"
                && $path =~ /edit/i
                && $tp >= $START_EDIT
                && $tp <= $END_EDIT;
        }

        # Purely arithmetic bucketing on the epoch value.
        my $bucket_start = int($epoch / $PERIOD_SECONDS) * $PERIOD_SECONDS;
        my $bucket_end   = $bucket_start + $PERIOD_SECONDS;

        my $npath  = normalise_path($path);
        my $aclass = agent_class($status, $ua);

        # Bad bots get a single counter; every other agent class is split
        # by method and status bucket.
        my $metric = $aclass eq "badbot"
            ? "badbot_308"
            : join("_", $aclass, method_bucket($method), status_bucket($status));

        my $bucket = ($BUCKETS{$bucket_start} //= {});
        $bucket->{end} = $bucket_end;
        $bucket->{stats}{$server_name}{$npath}{$metric}++;
    }
    close $fh;
}

# -------- write outputs --------
my @ACTORS  = qw(curlwget ai bot human);
my @METHODS = qw(get head post put other);
my @SB      = qw(ok redir client_err other);

# Column order: every actor_method_status counter (actor varies slowest,
# status bucket fastest), followed by the four fixed trailing columns.
my @COLS = map {
    my $actor = $_;
    map {
        my $method = $_;
        map { join("_", $actor, $method, $_) } @SB;
    } @METHODS;
} @ACTORS;
push @COLS, qw(badbot_308 total_hits server_name path);

# Write one TSV file per bucket, oldest bucket first.  Each file has a
# header row of @COLS followed by one row per (server_name, path).  Within
# a server, rows are ordered by descending total hits, with ties broken by
# path so that repeated runs produce identical files.
for my $bstart (sort { $a <=> $b } keys %BUCKETS) {
    my $bend = $BUCKETS{$bstart}{end};
    my $out = File::Spec->catfile(
        $OUTDIR,
        fmt_ts($bstart) . "-to-" . fmt_ts($bend) . ".tsv"
    );

    print STDERR "$cmd: writing $out\n" if $VERBOSE;

    open my $outf, ">", $out or die "$cmd: write $out: $!";
    print $outf join("\t", @COLS), "\n";

    my $stats = $BUCKETS{$bstart}{stats};

    # Counter columns are everything except the derived/descriptive ones.
    my @counter_cols = grep {
        $_ ne 'total_hits' && $_ ne 'server_name' && $_ ne 'path'
    } @COLS;

    for my $srv (sort keys %$stats) {
        my $pages = $stats->{$srv};

        # Total each path once up front; the sort and the total_hits
        # column both read from this table.
        my %total_for;
        for my $p (keys %$pages) {
            my $sum = 0;
            $sum += ($pages->{$p}{$_} // 0) for @counter_cols;
            $total_for{$p} = $sum;
        }

        my @ordered = sort {
            $total_for{$b} <=> $total_for{$a} || $a cmp $b
        } keys %$pages;

        for my $p (@ordered) {
            my $counts = $pages->{$p};

            # Build the row in header order; absent counters print as 0.
            my @vals;
            for my $c (@COLS) {
                push @vals,
                      $c eq 'total_hits'  ? $total_for{$p}
                    : $c eq 'server_name' ? $srv
                    : $c eq 'path'        ? $p
                    :                       ($counts->{$c} // 0);
            }

            print $outf join("\t", @vals), "\n";
        }
    }

    # Buffered write errors only surface at close time.
    close $outf or die "$cmd: close $out: $!";
}

categories