package tools
import (
"encoding/csv"
"fmt"
"io"
"os"
"path/filepath"
"slices"
"sort"
"strconv"
"strings"
"skraak/utils"
)
// CallsClipLabelsInput holds the parameters accepted by CallsClipLabels.
type CallsClipLabelsInput struct {
	// Folder is the directory scanned for .data files.
	Folder string `json:"folder"`
	// MappingPath is the mapping file loaded with utils.LoadMappingFile.
	MappingPath string `json:"mapping"`
	// Filter, when non-empty, restricts processing to labels whose Filter
	// field equals this value; an empty Filter accepts every label.
	Filter string `json:"filter,omitempty"`
	// OutputPath is the CSV file that is created, or appended to when it
	// already exists, is non-empty and has a matching header.
	OutputPath string `json:"output"`
	// ClipDuration is the clip window length and must be > 0.
	// NOTE(review): presumably seconds, like the segment times — confirm.
	ClipDuration float64 `json:"clip_duration"`
	// ClipOverlap is the overlap between consecutive clip windows and must
	// lie in [0, ClipDuration).
	ClipOverlap float64 `json:"clip_overlap"`
	// MinLabelOverlap is the minimum overlap between a segment and a clip
	// window for the segment to count towards that clip; it must be > 0.
	MinLabelOverlap float64 `json:"min_label_overlap"`
	// FinalClip is parsed by utils.ParseFinalClipMode; the accepted values are
	// defined there and are not visible in this file.
	FinalClip string `json:"final_clip"`
}
// CallsClipLabelsOutput summarises one CallsClipLabels run.
type CallsClipLabelsOutput struct {
	// Folder echoes the input folder.
	Folder string `json:"folder"`
	// OutputPath echoes the input output path.
	OutputPath string `json:"output"`
	// Filter echoes the input filter (omitted from JSON when empty).
	Filter string `json:"filter,omitempty"`
	// Classes lists the real classes taken from the mapping file; it is
	// sorted before being returned on success.
	Classes []string `json:"classes"`
	// DataFilesParsed is the number of .data files parsed.
	DataFilesParsed int `json:"data_files_parsed"`
	// ClipsNegative counts clips emitted with every flag False because a
	// negative-mapped segment overlapped them.
	ClipsNegative int `json:"clips_negative"`
	// ClipsIgnored counts clips dropped because an ignore-mapped segment
	// overlapped them.
	ClipsIgnored int `json:"clips_ignored"`
	// SegmentsIgnored counts labels that resolved to the ignore kind.
	SegmentsIgnored int `json:"segments_ignored"`
	// ClipsAllFalseGap counts clips emitted with every flag False because no
	// segment overlapped them sufficiently.
	ClipsAllFalseGap int `json:"clips_all_false_gap"`
	// PerClassTrueCount maps a class name to the number of clips flagged True
	// for that class.
	PerClassTrueCount map[string]int `json:"per_class_true_count"`
	// AppendedToFile reports whether rows were appended to an existing,
	// non-empty output file rather than written to a fresh one.
	AppendedToFile bool `json:"appended_to_file"`
	// ExistingRowsFound is the number of distinct (file, start, end) keys
	// read from the existing output file.
	ExistingRowsFound int `json:"existing_rows_found"`
	// RowsWritten is the number of rows written by this run.
	RowsWritten int `json:"rows_written"`
}
// resolvedSeg is one label of a segment after it has been run through the
// mapping: the segment's time span plus the kind the label resolved to.
type resolvedSeg struct {
	// start is the segment start time.
	start float64
	// end is the segment end time.
	end float64
	// kind is the mapping kind the label's species resolved to.
	kind utils.MappingKind
	// classIdx is the index into the class list; it is only set for
	// utils.MappingReal entries and is left at zero otherwise.
	classIdx int
}
// clipDisposition is the outcome of classifying one clip window against the
// resolved segments of its file.
type clipDisposition int

const (
	// dispoLabelled: at least one real class overlaps the clip sufficiently.
	dispoLabelled clipDisposition = iota
	// dispoNegative: a negative-mapped segment overlaps the clip; the row is
	// written with every flag False.
	dispoNegative
	// dispoGap: nothing overlaps the clip sufficiently; the row is written
	// with every flag False.
	dispoGap
	// dispoIgnored: an ignore-mapped segment overlaps the clip; no row is
	// written for it.
	dispoIgnored
)
// clipLabelsRow is one output CSV row: a clip of a file plus one boolean flag
// per class.
type clipLabelsRow struct {
	// file is the path written in the first CSV column.
	file string
	// start and end bound the clip window.
	start, end float64
	// flags holds one entry per class, in class order.
	flags []bool
}
// rowKey identifies a CSV row by its first three columns, all kept as the
// text that appears (or will appear) in the file.
type rowKey struct {
	// file is the text of the file column.
	file string
	// start and end are the text of the start_time and end_time columns.
	start, end string
}
// parsedClipFile pairs a .data file path with its parsed contents.
type parsedClipFile struct {
	// path is the .data file path as returned by utils.FindDataFiles.
	path string
	// df is the result of utils.ParseDataFile for path.
	df *utils.DataFile
}
// validateClipLabelsInput parses the final-clip mode and checks the numeric
// clip parameters.
//
// It returns the parsed utils.FinalClipMode, or an error when FinalClip is
// rejected by utils.ParseFinalClipMode, ClipDuration is not > 0, ClipOverlap
// is not in [0, ClipDuration), or MinLabelOverlap is not > 0.
func validateClipLabelsInput(input CallsClipLabelsInput) (utils.FinalClipMode, error) {
	finalClipMode, err := utils.ParseFinalClipMode(input.FinalClip)
	if err != nil {
		return 0, err
	}
	// Each check is written as "not inside the valid range" instead of
	// "inside the invalid range": every ordered comparison with NaN is false,
	// so the negated form rejects NaN where `x <= 0` would let it through.
	if !(input.ClipDuration > 0) {
		return 0, fmt.Errorf("--clip-duration must be > 0, got %v", input.ClipDuration)
	}
	if !(input.ClipOverlap >= 0 && input.ClipOverlap < input.ClipDuration) {
		return 0, fmt.Errorf("--clip-overlap must be in [0, clip-duration), got %v", input.ClipOverlap)
	}
	if !(input.MinLabelOverlap > 0) {
		return 0, fmt.Errorf("--min-label-overlap must be > 0, got %v", input.MinLabelOverlap)
	}
	return finalClipMode, nil
}
// parseClipLabelsDataFiles finds and parses every .data file under folder and
// verifies that the mapping covers every species that will be processed.
//
// Only labels that pass the filter (an empty filter passes everything) take
// part in the coverage check. It fails when the folder cannot be scanned,
// contains no .data files, a file cannot be parsed, a file lacks a positive
// Duration, or the mapping is missing a species.
func parseClipLabelsDataFiles(folder, filter string, mapping utils.MappingFile) ([]parsedClipFile, error) {
	paths, err := utils.FindDataFiles(folder)
	switch {
	case err != nil:
		return nil, fmt.Errorf("scan folder %s: %w", folder, err)
	case len(paths) == 0:
		return nil, fmt.Errorf("no .data files found in %s", folder)
	}

	species := make(map[string]bool)
	files := make([]parsedClipFile, 0, len(paths))
	for _, dataPath := range paths {
		df, parseErr := utils.ParseDataFile(dataPath)
		if parseErr != nil {
			return nil, fmt.Errorf("parse %s: %w", dataPath, parseErr)
		}
		// Clip windows are derived from the duration, so it has to be known.
		if df.Meta == nil || df.Meta.Duration <= 0 {
			return nil, fmt.Errorf("missing or non-positive Duration in %s (cannot generate clips)", dataPath)
		}
		for _, seg := range df.Segments {
			for _, lbl := range seg.Labels {
				if filter == "" || lbl.Filter == filter {
					species[lbl.Species] = true
				}
			}
		}
		files = append(files, parsedClipFile{path: dataPath, df: df})
	}

	missing := mapping.ValidateCoversSpecies(species)
	if len(missing) == 0 {
		return files, nil
	}
	return nil, fmt.Errorf("mapping.json is missing entries for species: %s\n(run /data-mapping to regenerate)", strings.Join(missing, ", "))
}
// dedupClipLabelsRows reports an error for the first row whose
// (file, start, end) key is already present in existing or was produced by an
// earlier row of rows. Times are compared in their formatTime text form.
// Neither argument is modified.
func dedupClipLabelsRows(rows []clipLabelsRow, existing map[rowKey]bool) error {
	fresh := make(map[rowKey]bool, len(rows))
	for i := range rows {
		key := rowKey{
			file:  rows[i].file,
			start: formatTime(rows[i].start),
			end:   formatTime(rows[i].end),
		}
		// Presence of the key is what matters for existing, not its value.
		_, onDisk := existing[key]
		if onDisk || fresh[key] {
			return fmt.Errorf("duplicate clip detected: file=%s start=%s end=%s", key.file, key.start, key.end)
		}
		fresh[key] = true
	}
	return nil
}
// CallsClipLabels generates per-clip class labels for every .data file in
// input.Folder and writes them as CSV rows to input.OutputPath.
//
// Steps: validate the input, load the mapping, parse the data files, read any
// existing output (which switches to append mode and must have the same
// header), build the rows for each file, reject duplicates against both the
// existing file and the new rows, then write.
//
// The returned output is only complete on success; on error it holds whatever
// had been filled in before the failing step.
func CallsClipLabels(input CallsClipLabelsInput) (CallsClipLabelsOutput, error) {
	out := CallsClipLabelsOutput{
		Folder:            input.Folder,
		OutputPath:        input.OutputPath,
		PerClassTrueCount: map[string]int{},
	}

	finalClipMode, err := validateClipLabelsInput(input)
	if err != nil {
		return out, err
	}

	mapping, err := utils.LoadMappingFile(input.MappingPath)
	if err != nil {
		return out, fmt.Errorf("load mapping %s: %w", input.MappingPath, err)
	}
	classes := mapping.Classes()
	if len(classes) == 0 {
		return out, fmt.Errorf("mapping.json has no real (non-sentinel) classes")
	}
	// Report a private copy: the list is sorted in place before returning, and
	// sorting the slice handed out by the mapping would silently reorder data
	// this function does not own.
	out.Classes = slices.Clone(classes)
	out.Filter = input.Filter

	// classIdx maps a class name to its column position after the three
	// leading file/start/end columns.
	classIdx := make(map[string]int, len(classes))
	for i, c := range classes {
		classIdx[c] = i
	}

	parsed, err := parseClipLabelsDataFiles(input.Folder, input.Filter, mapping)
	if err != nil {
		return out, err
	}
	out.DataFilesParsed = len(parsed)

	expectedHeader := append([]string{"file", "start_time", "end_time"}, classes...)
	existing, appendMode, err := loadExistingRows(input.OutputPath, expectedHeader)
	if err != nil {
		return out, err
	}
	out.AppendedToFile = appendMode
	out.ExistingRowsFound = len(existing)

	cwd, err := os.Getwd()
	if err != nil {
		return out, fmt.Errorf("getwd: %w", err)
	}
	folderAbs, err := filepath.Abs(input.Folder)
	if err != nil {
		return out, fmt.Errorf("abs %s: %w", input.Folder, err)
	}

	rows := make([]clipLabelsRow, 0, 1024)
	for _, pf := range parsed {
		fileRows, err := processClipLabelsFile(pf.path, pf.df, mapping, classIdx, classes, input, finalClipMode, cwd, folderAbs, &out)
		if err != nil {
			return out, err
		}
		rows = append(rows, fileRows...)
	}

	// Nothing is written unless the whole batch is free of duplicates.
	if err := dedupClipLabelsRows(rows, existing); err != nil {
		return out, err
	}
	if err := writeRows(input.OutputPath, expectedHeader, rows, appendMode); err != nil {
		return out, err
	}
	out.RowsWritten = len(rows)

	// Only the reported list is sorted; the CSV columns keep the order
	// returned by mapping.Classes().
	sort.Strings(out.Classes)
	return out, nil
}
// processClipLabelsFile builds the label rows for a single parsed .data file.
//
// It generates the clip windows from the file's duration, resolves the file's
// segments through the mapping, and labels every window. A file that yields
// no windows produces no rows and no error. Counters in out are updated by
// the helpers it calls.
func processClipLabelsFile(
	path string,
	df *utils.DataFile,
	mapping utils.MappingFile,
	classIdx map[string]int,
	classes []string,
	input CallsClipLabelsInput,
	finalClipMode utils.FinalClipMode,
	cwd, folderAbs string,
	out *CallsClipLabelsOutput,
) ([]clipLabelsRow, error) {
	// The trailing 10 is passed through to utils.GenerateClipTimes unchanged;
	// its meaning is defined there and is not visible in this file.
	clipWindows, err := utils.GenerateClipTimes(df.Meta.Duration, input.ClipDuration, input.ClipOverlap, finalClipMode, 10)
	switch {
	case err != nil:
		return nil, fmt.Errorf("generate clip windows for %s: %w", path, err)
	case len(clipWindows) == 0:
		return nil, nil
	}

	resolved := resolveSegments(df.Segments, input.Filter, input.MinLabelOverlap, mapping, classIdx, out)

	wavRel, relErr := computeWavRelPath(path, cwd, folderAbs)
	if relErr != nil {
		return nil, relErr
	}

	rows := labelClipWindows(clipWindows, resolved, wavRel, classes, input.MinLabelOverlap, out)
	return rows, nil
}
// resolveSegments turns the raw segments of one file into resolvedSeg entries,
// one per label that survives filtering and mapping.
//
// Segments shorter than minLabelOverlap are skipped outright. Labels that do
// not match a non-empty filter, that the mapping cannot classify, or whose
// real class is absent from classIdx are dropped. out.SegmentsIgnored is
// incremented once for every label that resolves to the ignore kind.
func resolveSegments(
	segments []*utils.Segment,
	filter string,
	minLabelOverlap float64,
	mapping utils.MappingFile,
	classIdx map[string]int,
	out *CallsClipLabelsOutput,
) []resolvedSeg {
	resolved := make([]resolvedSeg, 0, len(segments))
	for _, seg := range segments {
		if length := seg.EndTime - seg.StartTime; length < minLabelOverlap {
			continue
		}
		for _, lbl := range seg.Labels {
			if filter != "" && lbl.Filter != filter {
				continue
			}
			canon, kind, known := mapping.Classify(lbl.Species)
			if !known {
				continue
			}

			entry := resolvedSeg{start: seg.StartTime, end: seg.EndTime, kind: kind}
			switch kind {
			case utils.MappingIgn:
				out.SegmentsIgnored++
			case utils.MappingNeg:
				// Nothing beyond the time span and kind is needed.
			case utils.MappingReal:
				idx, present := classIdx[canon]
				if !present {
					continue
				}
				entry.classIdx = idx
			default:
				// Any other kind contributes nothing.
				continue
			}
			resolved = append(resolved, entry)
		}
	}
	return resolved
}
// computeWavRelPath derives the path written in the CSV file column for the
// data file at dataPath.
//
// The name is the base name of dataPath with a trailing ".data" removed,
// placed inside folderAbs and expressed relative to cwd. If the path cannot be
// made relative, the joined path is used as is. Any non-empty, non-absolute
// result that does not already start with "./" gets that prefix (so a result
// starting with "../" becomes "./../..."). The returned error is always nil.
func computeWavRelPath(dataPath, cwd, folderAbs string) (string, error) {
	dotSlash := "." + string(filepath.Separator)

	base := filepath.Base(dataPath)
	wavAbs := filepath.Join(folderAbs, strings.TrimSuffix(base, ".data"))

	result, relErr := filepath.Rel(cwd, wavAbs)
	if relErr != nil {
		result = wavAbs
	}

	switch {
	case result == "", filepath.IsAbs(result), strings.HasPrefix(result, dotSlash):
		return result, nil
	}
	return dotSlash + result, nil
}
// labelClipWindows produces one row per clip window, except for windows that
// are classified as ignored, which are counted and skipped.
//
// Negative and gap windows yield rows with every flag False; labelled windows
// get True for each class that hit. The matching counters in out are updated
// as rows are produced.
func labelClipWindows(windows []utils.ClipWindow, segs []resolvedSeg, rel string, classes []string, minLabelOverlap float64, out *CallsClipLabelsOutput) []clipLabelsRow {
	nClasses := len(classes)
	var rows []clipLabelsRow
	for _, win := range windows {
		dispo, hits := classifyClip(win, segs, minLabelOverlap, nClasses)
		if dispo == dispoIgnored {
			out.ClipsIgnored++
			continue
		}

		flags := make([]bool, nClasses)
		switch dispo {
		case dispoLabelled:
			for i := range hits {
				if !hits[i] {
					continue
				}
				flags[i] = true
				out.PerClassTrueCount[classes[i]]++
			}
		case dispoGap:
			out.ClipsAllFalseGap++
		case dispoNegative:
			out.ClipsNegative++
		}

		rows = append(rows, clipLabelsRow{file: rel, start: win.Start, end: win.End, flags: flags})
	}
	return rows
}
// classifyClip decides the disposition of one clip window.
//
// Only segments overlapping the window by at least minLabelOverlap count.
// Precedence is: any ignore segment -> dispoIgnored (with nil hits); else any
// negative segment -> dispoNegative; else any real class -> dispoLabelled;
// else dispoGap. For every disposition but dispoIgnored the per-class hit
// slice (length nClasses) is returned alongside.
//
// NOTE(review): a clip that overlaps both a negative segment and a real-class
// segment is reported as negative, so the caller writes it all False —
// confirm this precedence is intended.
func classifyClip(w utils.ClipWindow, segs []resolvedSeg, minLabelOverlap float64, nClasses int) (clipDisposition, []bool) {
	hits := make([]bool, nClasses)
	var sawIgnore, sawNegative bool

	for i := range segs {
		seg := &segs[i]
		if overlapSeconds(seg.start, seg.end, w.Start, w.End) < minLabelOverlap {
			continue
		}
		switch seg.kind {
		case utils.MappingReal:
			hits[seg.classIdx] = true
		case utils.MappingNeg:
			sawNegative = true
		case utils.MappingIgn:
			sawIgnore = true
		}
	}

	switch {
	case sawIgnore:
		return dispoIgnored, nil
	case sawNegative:
		return dispoNegative, hits
	case slices.Contains(hits, true):
		return dispoLabelled, hits
	default:
		return dispoGap, hits
	}
}
// loadExistingRows reads the (file, start, end) keys already present in the
// CSV at outputPath.
//
// A missing or zero-length file yields (nil, false, nil), meaning "write a
// fresh file". Otherwise the header must equal expectedHeader exactly and
// every data row must have at least three fields; on success the key set is
// returned together with true, meaning "append".
func loadExistingRows(outputPath string, expectedHeader []string) (map[rowKey]bool, bool, error) {
	info, statErr := os.Stat(outputPath)
	switch {
	case statErr == nil:
		// The file exists; continue below.
	case os.IsNotExist(statErr):
		return nil, false, nil
	default:
		return nil, false, fmt.Errorf("stat %s: %w", outputPath, statErr)
	}
	if info.Size() == 0 {
		return nil, false, nil
	}

	file, err := os.Open(outputPath)
	if err != nil {
		return nil, false, fmt.Errorf("open existing %s: %w", outputPath, err)
	}
	defer func() { _ = file.Close() }()

	reader := csv.NewReader(file)
	// Row lengths are checked by hand below instead of by the reader.
	reader.FieldsPerRecord = -1

	header, err := reader.Read()
	if err != nil {
		return nil, false, fmt.Errorf("read header of existing %s: %w", outputPath, err)
	}
	if !slices.Equal(header, expectedHeader) {
		return nil, false, fmt.Errorf("column-set mismatch in existing %s\n existing: %s\n new: %s",
			outputPath, strings.Join(header, ","), strings.Join(expectedHeader, ","))
	}

	keys := make(map[rowKey]bool)
	for {
		record, readErr := reader.Read()
		if readErr == io.EOF {
			return keys, true, nil
		}
		if readErr != nil {
			return nil, false, fmt.Errorf("read row of existing %s: %w", outputPath, readErr)
		}
		if len(record) < 3 {
			return nil, false, fmt.Errorf("malformed row in existing %s: %v", outputPath, record)
		}
		keys[rowKey{file: record[0], start: record[1], end: record[2]}] = true
	}
}
// overlapSeconds returns the length of the intersection of the intervals
// [aStart, aEnd] and [bStart, bEnd], or 0 when they do not overlap or only
// touch at a point.
func overlapSeconds(aStart, aEnd, bStart, bEnd float64) float64 {
	// The intersection runs from the later start to the earlier end.
	from, to := max(aStart, bStart), min(aEnd, bEnd)
	if to <= from {
		return 0
	}
	return to - from
}
// formatTime renders v in plain decimal notation with the fewest digits that
// round-trip, appending ".0" when the result has no decimal point so that
// whole numbers still read as floats (5 -> "5.0").
func formatTime(v float64) string {
	text := strconv.FormatFloat(v, 'f', -1, 64)
	if strings.IndexByte(text, '.') >= 0 {
		return text
	}
	return text + ".0"
}
// writeRows writes rows as CSV to path.
//
// In append mode the rows are added to the end of the existing file and no
// header is written; otherwise the file is created or truncated and header is
// written first. Each row is the file, the start and end times rendered with
// formatTime, then "True" or "False" per flag.
//
// It returns an error if the file cannot be opened, a write or flush fails,
// or closing the file fails (a failed close can mean buffered data never
// reached the disk).
func writeRows(path string, header []string, rows []clipLabelsRow, appendMode bool) (err error) {
	flags := os.O_CREATE | os.O_TRUNC | os.O_WRONLY
	if appendMode {
		// Read access is needed to inspect the file's final byte below.
		flags = os.O_APPEND | os.O_RDWR
	}
	f, err := os.OpenFile(path, flags, 0644)
	if err != nil {
		return fmt.Errorf("open %s for write: %w", path, err)
	}
	defer func() {
		// Surface a close failure unless an earlier error is already being
		// returned.
		if cerr := f.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("close %s: %w", path, cerr)
		}
	}()

	if appendMode && len(rows) > 0 {
		// Without this, a file whose last line lacks a newline would have the
		// first appended row glued onto that line.
		if err := ensureTrailingNewline(f); err != nil {
			return fmt.Errorf("terminate last line of %s: %w", path, err)
		}
	}

	w := csv.NewWriter(f)
	if !appendMode {
		if err := w.Write(header); err != nil {
			return fmt.Errorf("write header: %w", err)
		}
	}

	// One scratch record is reused for every row; it is rebuilt with append so
	// rows with differing flag counts cannot index past its end.
	rec := make([]string, 0, len(header))
	for _, r := range rows {
		rec = append(rec[:0], r.file, formatTime(r.start), formatTime(r.end))
		for _, set := range r.flags {
			if set {
				rec = append(rec, "True")
			} else {
				rec = append(rec, "False")
			}
		}
		if err := w.Write(rec); err != nil {
			return fmt.Errorf("write row: %w", err)
		}
	}
	w.Flush()
	return w.Error()
}

// ensureTrailingNewline appends "\n" to f when f is non-empty and its last
// byte is not a newline. f must be open for reading and writing.
func ensureTrailingNewline(f *os.File) error {
	info, err := f.Stat()
	if err != nil {
		return err
	}
	if info.Size() == 0 {
		return nil
	}
	var last [1]byte
	if _, err := f.ReadAt(last[:], info.Size()-1); err != nil {
		return err
	}
	if last[0] == '\n' {
		return nil
	}
	_, err = f.WriteString("\n")
	return err
}