quietlight/skraak_mcp: shell_scripts/verify_cluster

#!/bin/bash

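# Verification script for the bulk import cluster assignment fix.
# Tests that files are correctly distributed when the same location has
# multiple date ranges.
# Usage: ./verify_cluster_fix.sh [db_path]
# Default db_path: ../db/test.duckdb (relative to this script's directory)
# Requires: jq, timeout, and a built ./skraak_mcp binary in the project root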

# Get absolute paths before changing directory
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
DB_PATH="${1:-$PROJECT_DIR/db/test.duckdb}"

if [ ! -f "$DB_PATH" ]; then
    echo "Error: Database not found at $DB_PATH"
    exit 1
fi

echo "======================================================================"
echo "Bulk Import Cluster Assignment Fix Verification"
echo "======================================================================"
echo "Database: $DB_PATH"
echo ""
echo "This test verifies that when the same location appears multiple times"
echo "in the CSV with different date ranges, files are distributed correctly"
echo "across their respective clusters (not all going to the last cluster)."
echo ""

# Navigate to the project directory
cd "$PROJECT_DIR" || exit 1

if [ ! -f "./skraak_mcp" ]; then
    echo "Error: skraak_mcp binary not found. Run 'go build' first."
    exit 1
fi

# Function to send MCP request
send_request() {
    local method="$1"
    local params="$2"

    (
        echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}'
        sleep 0.2
        echo "{\"jsonrpc\":\"2.0\",\"id\":2,\"method\":\"$method\",\"params\":$params}"
        sleep 0.5
    ) | timeout 10 ./skraak_mcp "$DB_PATH" 2>&1 | grep '"id":2' | head -1
}

echo "Step 1: Create test dataset and location"
echo "========================================="

# Create a test dataset
echo -n "Creating test dataset... "
DATASET_RESULT=$(send_request "tools/call" '{"name":"create_or_update_dataset","arguments":{"name":"Cluster Fix Verification","type":"test","description":"Testing cluster assignment bug fix"}}')
DATASET_ID=$(echo "$DATASET_RESULT" | jq -r '.result.structuredContent.dataset.id // empty')
if [ -n "$DATASET_ID" ]; then
    echo "✓ Created: $DATASET_ID"
else
    echo "✗ Failed"
    exit 1
fi

# Create ONE test location (same location will be used multiple times)
echo -n "Creating test location... "
LOCATION_RESULT=$(send_request "tools/call" '{"name":"create_or_update_location","arguments":{"dataset_id":"'"$DATASET_ID"'","name":"Multi-Year Recording Site","latitude":-41.2865,"longitude":174.7762,"timezone_id":"Pacific/Auckland","description":"Site with recordings from multiple years"}}')
LOCATION_ID=$(echo "$LOCATION_RESULT" | jq -r '.result.structuredContent.location.id // empty')
if [ -n "$LOCATION_ID" ]; then
    echo "✓ Created: $LOCATION_ID"
else
    echo "✗ Failed"
    exit 1
fi
echo ""

echo "Step 2: Create CSV with SAME LOCATION, DIFFERENT DATE RANGES"
echo "=============================================================="

CSV_FILE="/tmp/verify_cluster_fix_$$.csv"
LOG_FILE="/tmp/verify_cluster_fix_$$.log"

# CSV with same location ID appearing 4 times with different date ranges
# This is the exact scenario that triggered the bug
cat > "$CSV_FILE" << EOF
location_name,location_id,directory_path,date_range,sample_rate,file_count
Multi-Year Recording Site,$LOCATION_ID,/nonexistent/2019,2019,8000,100
Multi-Year Recording Site,$LOCATION_ID,/nonexistent/2020,2020,8000,200
Multi-Year Recording Site,$LOCATION_ID,/nonexistent/2022,2022,8000,300
Multi-Year Recording Site,$LOCATION_ID,/nonexistent/2024,2024,8000,400
EOF

echo "CSV Contents:"
cat "$CSV_FILE"
echo ""
echo "KEY OBSERVATION: Same location_id ($LOCATION_ID) appears 4 times"
echo "                 with different date_range values (2019, 2020, 2022, 2024)"
echo ""

echo "Step 3: Run bulk import (expect cluster creation only, no files)"
echo "=================================================================="

echo "Calling bulk_file_import..."
IMPORT_RESULT=$(send_request "tools/call" "{\"name\":\"bulk_file_import\",\"arguments\":{\"dataset_id\":\"$DATASET_ID\",\"csv_path\":\"$CSV_FILE\",\"log_file_path\":\"$LOG_FILE\"}}")

# Extract results
CLUSTERS_CREATED=$(echo "$IMPORT_RESULT" | jq -r '.result.structuredContent.clusters_created // 0')
CLUSTERS_EXISTING=$(echo "$IMPORT_RESULT" | jq -r '.result.structuredContent.clusters_existing // 0')
TOTAL_CLUSTERS=$((CLUSTERS_CREATED + CLUSTERS_EXISTING))

echo ""
echo "Import Results:"
echo "  Clusters created: $CLUSTERS_CREATED"
echo "  Clusters existing: $CLUSTERS_EXISTING"
echo "  Total clusters: $TOTAL_CLUSTERS"
echo ""

if [ "$TOTAL_CLUSTERS" -eq 4 ]; then
    echo "✓ PASS: 4 clusters created/found (one per date range)"
else
    echo "✗ FAIL: Expected 4 clusters, got $TOTAL_CLUSTERS"
    exit 1
fi

echo "Step 4: Verify cluster names in database"
echo "=========================================="

# Query clusters for this location
echo "Querying database for clusters..."
SQL_QUERY="SELECT name FROM cluster WHERE location_id = '$LOCATION_ID' AND active = true ORDER BY name"
QUERY_RESULT=$(send_request "tools/call" "{\"name\":\"execute_sql\",\"arguments\":{\"query\":\"$SQL_QUERY\"}}")

CLUSTER_NAMES=$(echo "$QUERY_RESULT" | jq -r '.result.structuredContent.rows[] | .[0]' 2>/dev/null)

echo ""
echo "Clusters found in database:"
echo "$CLUSTER_NAMES" | while read -r name; do
    echo "  - $name"
done
echo ""

# Count clusters
CLUSTER_COUNT=$(echo "$CLUSTER_NAMES" | grep -c .)
if [ "$CLUSTER_COUNT" -eq 4 ]; then
    echo "✓ PASS: 4 distinct clusters in database"
else
    echo "✗ FAIL: Expected 4 clusters, found $CLUSTER_COUNT"
fi

# Verify expected names
echo ""
echo "Verifying expected cluster names..."
for year in 2019 2020 2022 2024; do
    if echo "$CLUSTER_NAMES" | grep -q "^$year$"; then
        echo "  ✓ Found cluster: $year"
    else
        echo "  ✗ Missing cluster: $year"
    fi
done

echo ""
echo "Step 5: Check log file"
echo "======================"

if [ -f "$LOG_FILE" ]; then
    echo "✓ Log file created"
    echo ""
    echo "Log excerpt (cluster creation):"
    grep -E "(Creating|Using existing) cluster:" "$LOG_FILE" | head -4
else
    echo "ℹ Log file not created"
fi

echo ""
echo "======================================================================"
echo "VERIFICATION COMPLETE"
echo "======================================================================"
echo ""
echo "Expected behavior (AFTER fix):"
echo "  ✓ 4 clusters created (2019, 2020, 2022, 2024)"
echo "  ✓ Each cluster has unique name matching date_range"
echo "  ✓ Files (if present) would be distributed to correct clusters"
echo ""
echo "Bug behavior (BEFORE fix):"
echo "  ✗ 4 clusters created BUT clusterIDMap[location_id] overwrites"
echo "  ✗ Only last cluster (2024) stored in map"
echo "  ✗ ALL files would go to 2024 cluster"
echo ""
echo "This test validates that the composite key fix works correctly."
echo "The map now uses 'locationID|dateRange' preventing overwrites."
echo ""

# Cleanup
rm -f "$CSV_FILE" "$LOG_FILE"
echo "Cleanup complete."