quietlight/skraakCLI: tools/calls_detect

package tools

import (
	"fmt"
	"os"
	"path/filepath"

	"skraak/utils"
)

// DetectAnomaliesInput holds the parameters for DetectAnomalies.
//
// Folder must name an existing directory and Models must contain at least two
// distinct filter names; both rules are enforced by validateAnomalyInput.
// Species entries are compared against "species" and "species+calltype" keys
// (see inScope).
type DetectAnomaliesInput struct {
	Folder  string
	Models  []string // at least 2 filter names
	Species []string // optional scope; empty = all species
}

// DetectAnomaliesOutput is the JSON-serializable result of DetectAnomalies.
//
// Folder is the cleaned input folder and Models echoes the requested filters.
// FilesExamined counts the .data files that parsed successfully, and
// FilesWithAllModels counts those for which detectAnomaliesInFile returned a
// non-nil result. AnomaliesTotal equals len(Anomalies); LabelMismatches and
// CertaintyMismatches break that total down by Anomaly.Type. Error holds the
// failure message whenever DetectAnomalies returns a non-nil error.
type DetectAnomaliesOutput struct {
	Folder              string    `json:"folder"`
	Models              []string  `json:"models"`
	FilesExamined       int       `json:"files_examined"`
	FilesWithAllModels  int       `json:"files_with_all_models"`
	AnomaliesTotal      int       `json:"anomalies_total"`
	LabelMismatches     int       `json:"label_mismatches"`
	CertaintyMismatches int       `json:"certainty_mismatches"`
	Anomalies           []Anomaly `json:"anomalies,omitempty"`
	Error               string    `json:"error,omitempty"`
}

// Anomaly describes one disagreement between models within a single file.
// File is the path of the .data file, Type names the kind of mismatch, and
// Segments lists the compared segments, one entry per model in the comparison
// group (see buildAnomalySegs).
type Anomaly struct {
	File     string           `json:"file"`
	Type     string           `json:"type"` // "label_mismatch" | "certainty_mismatch"
	Segments []AnomalySegment `json:"segments"`
}

// AnomalySegment is one model's side of an Anomaly: the name of the model
// filter, the time bounds of the segment it labelled, and the species,
// call type and certainty recorded on that label. CallType is omitted from
// the JSON output when empty.
type AnomalySegment struct {
	Model     string  `json:"model"`
	Start     float64 `json:"start"`
	End       float64 `json:"end"`
	Species   string  `json:"species"`
	CallType  string  `json:"calltype,omitempty"`
	Certainty int     `json:"certainty"`
}

// Overview of DetectAnomalies (defined further below): it compares corresponding
// segments across multiple ML model filters within each .data file. Segments are
// matched by time overlap (same logic as propagate). Lonely segments (no overlap
// in one or more models) are silently skipped. Anomalies are flagged when
// overlapping segments disagree on species+calltype, or when labels match but
// certainty values differ.

// validateAnomalyInput checks the parameters for DetectAnomalies.
//
// It returns an error when fewer than two models are given, when a model name
// is repeated, or when input.Folder cannot be used as a directory. A missing
// folder is reported as "folder not found"; any other stat failure (for
// example a permission problem) is reported separately and wraps the
// underlying error so callers can inspect it with errors.Is / errors.As.
// It returns nil when the input is acceptable.
func validateAnomalyInput(input DetectAnomaliesInput) error {
	if len(input.Models) < 2 {
		return fmt.Errorf("at least 2 --model values required")
	}

	// A set lookup detects repeats in a single pass over the model names.
	seen := make(map[string]struct{}, len(input.Models))
	for _, name := range input.Models {
		if _, dup := seen[name]; dup {
			return fmt.Errorf("duplicate --model values are not allowed")
		}
		seen[name] = struct{}{}
	}

	info, err := os.Stat(input.Folder)
	if err != nil {
		if os.IsNotExist(err) {
			return fmt.Errorf("folder not found: %s", input.Folder)
		}
		// Not every stat failure means the folder is absent; keep the cause.
		return fmt.Errorf("cannot access folder: %w", err)
	}
	if !info.IsDir() {
		return fmt.Errorf("not a directory: %s", input.Folder)
	}

	return nil
}

// DetectAnomalies scans the .data files found under input.Folder and reports
// segments on which the requested model filters disagree.
//
// The input is validated first; on failure the returned output carries the
// message in Error and the same error is returned. Files that fail to parse are
// skipped and not counted in FilesExamined. For each parsed file,
// detectAnomaliesInFile is consulted: a nil result is treated as "the file does
// not contain every model" and the file is skipped; otherwise the file counts
// toward FilesWithAllModels and its anomalies are tallied by type and appended
// to the output.
//
// If listing the files fails, the returned error wraps the underlying error
// (so errors.Is / errors.As keep working) and output.Error holds its text.
func DetectAnomalies(input DetectAnomaliesInput) (DetectAnomaliesOutput, error) {
	folder := filepath.Clean(input.Folder)
	output := DetectAnomaliesOutput{
		Folder: folder,
		Models: input.Models,
	}

	if err := validateAnomalyInput(input); err != nil {
		output.Error = err.Error()
		return output, err
	}

	files, err := utils.FindDataFiles(folder)
	if err != nil {
		// Wrap instead of flattening to a string so the cause stays inspectable.
		wrapped := fmt.Errorf("list .data files: %w", err)
		output.Error = wrapped.Error()
		return output, wrapped
	}

	// Species scope as a set; an empty set means "all species" (see inScope).
	scopeSet := make(map[string]bool, len(input.Species))
	for _, s := range input.Species {
		scopeSet[s] = true
	}

	for _, path := range files {
		df, err := utils.ParseDataFile(path)
		if err != nil {
			// Unparseable files are skipped; the scan continues with the rest.
			continue
		}
		output.FilesExamined++

		anomalies := detectAnomaliesInFile(df, path, input.Models, scopeSet)
		if anomalies == nil {
			// file didn't have all models present
			continue
		}
		output.FilesWithAllModels++
		for _, a := range anomalies {
			if a.Type == "label_mismatch" {
				output.LabelMismatches++
			} else {
				output.CertaintyMismatches++
			}
		}
		output.Anomalies = append(output.Anomalies, anomalies...)
	}
	output.AnomaliesTotal = len(output.Anomalies)
	return output, nil
}

// labeledSeg pairs a segment with the specific label matching the model filter.
// A segment carrying labels from several requested models yields one labeledSeg
// per matching label (see collectModelSegments). Both fields point at the
// values held by the parsed data file; nothing is copied.
type labeledSeg struct {
	seg   *utils.Segment
	label *utils.Label
}

// detectAnomaliesInFile compares, within one parsed .data file, every segment
// labelled by models[0] (the anchor model) with overlapping segments from each
// of the other models.
//
// It returns nil when models is empty or when any requested model has no
// labelled segment in the file; callers rely on nil meaning "file lacks a
// model". When every model is present it returns a non-nil slice, which is
// empty if nothing anomalous was found, so the two outcomes stay
// distinguishable. Anchors outside scope, and anchors lacking an overlapping
// segment in some other model, are skipped. path is recorded on each Anomaly.
func detectAnomaliesInFile(df *utils.DataFile, path string, models []string, scope map[string]bool) []Anomaly {
	if len(models) == 0 {
		return nil
	}
	modelSegs := collectModelSegments(df, models)

	// Skip file if any model is entirely absent.
	for _, model := range models {
		if len(modelSegs[model]) == 0 {
			return nil
		}
	}

	// Start from an empty, non-nil slice: nil is reserved for "model missing".
	anomalies := []Anomaly{}
	for _, anchor := range modelSegs[models[0]] {
		if !inScope(anchor, scope) {
			continue
		}
		matches := findOverlappingMatches(anchor, models, modelSegs)
		if matches == nil {
			// Lonely anchor: some model has no overlapping segment.
			continue
		}
		group := buildComparisonGroup(anchor, models, matches)
		if a := checkGroupAnomaly(group, path, models); a != nil {
			anomalies = append(anomalies, *a)
		}
	}
	return anomalies
}

// collectModelSegments walks every label of every segment in df and buckets
// the ones whose Filter is one of models, keyed by that filter name. Models
// with no matching label have no entry in the returned map.
func collectModelSegments(df *utils.DataFile, models []string) map[string][]labeledSeg {
	// Membership set of the requested filter names.
	wanted := make(map[string]struct{}, len(models))
	for _, name := range models {
		wanted[name] = struct{}{}
	}

	byModel := make(map[string][]labeledSeg, len(models))
	for _, segment := range df.Segments {
		for _, label := range segment.Labels {
			if _, ok := wanted[label.Filter]; !ok {
				continue
			}
			byModel[label.Filter] = append(byModel[label.Filter], labeledSeg{seg: segment, label: label})
		}
	}
	return byModel
}

// inScope reports whether the anchor's label passes the species scope filter.
// An empty scope admits everything. Otherwise the label is admitted when the
// scope contains its bare species, or its "species+calltype" key when the
// label has a call type.
func inScope(anchor labeledSeg, scope map[string]bool) bool {
	if len(scope) == 0 {
		return true
	}
	species, callType := anchor.label.Species, anchor.label.CallType
	if scope[species] {
		return true
	}
	if callType == "" {
		// Without a call type the combined key is just the species.
		return false
	}
	return scope[species+"+"+callType]
}

// findOverlappingMatches gathers, for every model after the first, the
// segments of that model which overlap the anchor's segment. The result maps
// model name to its overlapping segments. If some model has no overlapping
// segment the anchor is "lonely" and nil is returned instead.
func findOverlappingMatches(anchor labeledSeg, models []string, modelSegs map[string][]labeledSeg) map[string][]labeledSeg {
	result := make(map[string][]labeledSeg, len(models)-1)
	for _, name := range models[1:] {
		hits := result[name]
		for _, other := range modelSegs[name] {
			if overlaps(anchor.seg, other.seg) {
				hits = append(hits, other)
			}
		}
		if len(hits) == 0 {
			return nil
		}
		result[name] = hits
	}
	return result
}

// buildComparisonGroup returns the anchor followed by the first overlapping
// match of each remaining model, in the order the models are listed. Further
// matches for a model are not part of the group.
func buildComparisonGroup(anchor labeledSeg, models []string, matches map[string][]labeledSeg) []labeledSeg {
	others := models[1:]
	group := make([]labeledSeg, 0, 1+len(others))
	group = append(group, anchor)
	for i := range others {
		group = append(group, matches[others[i]][0])
	}
	return group
}

// checkGroupAnomaly compares every member of group against group[0].
// A difference in species or call type yields a "label_mismatch" anomaly and
// takes priority; if all labels agree but some certainty differs, a
// "certainty_mismatch" anomaly is returned. It returns nil when the whole
// group agrees. path is stored as the anomaly's File.
func checkGroupAnomaly(group []labeledSeg, path string, models []string) *Anomaly {
	report := func(kind string) *Anomaly {
		return &Anomaly{File: path, Type: kind, Segments: buildAnomalySegs(group, models)}
	}

	ref := group[0].label
	certaintyDiffers := false
	for _, member := range group[1:] {
		lbl := member.label
		if lbl.Species != ref.Species || lbl.CallType != ref.CallType {
			return report("label_mismatch")
		}
		if lbl.Certainty != ref.Certainty {
			// Keep scanning: a later label mismatch still wins.
			certaintyDiffers = true
		}
	}
	if !certaintyDiffers {
		return nil
	}
	return report("certainty_mismatch")
}

// buildAnomalySegs converts a comparison group into report entries. The i-th
// entry takes its Model from models[i], its time bounds from the segment and
// its species, call type and certainty from the paired label, so group and
// models must be in the same order.
func buildAnomalySegs(group []labeledSeg, models []string) []AnomalySegment {
	out := make([]AnomalySegment, 0, len(group))
	for idx := range group {
		seg, lbl := group[idx].seg, group[idx].label
		out = append(out, AnomalySegment{
			Model:     models[idx],
			Start:     seg.StartTime,
			End:       seg.EndTime,
			Species:   lbl.Species,
			CallType:  lbl.CallType,
			Certainty: lbl.Certainty,
		})
	}
	return out
}

// overlaps reports whether the time ranges of a and b intersect. Both
// comparisons are strict, so ranges that merely touch at an endpoint do not
// count as overlapping.
func overlaps(a, b *utils.Segment) bool {
	aStartsBeforeBEnds := a.StartTime < b.EndTime
	bStartsBeforeAEnds := b.StartTime < a.EndTime
	return aStartsBeforeBEnds && bStartsBeforeAEnds
}