package tools
import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"sync/atomic"
)
// CallsFromSourceInput holds the parameters for a calls-from-source run.
// Exactly one of Folder or File should be set.
type CallsFromSourceInput struct {
	Folder string `json:"folder"` // directory to scan for source files
	File   string `json:"file"`   // single source file to process instead of a folder
	Delete bool   `json:"delete"` // remove each source file after its data file is written

	// ProgressHandler, when non-nil, is called after each processed file
	// with (processed, total, base name). Excluded from JSON.
	ProgressHandler ProgressHandler `json:"-"`
}
// CallsFromSourceOutput is the aggregate result of a calls-from-source run.
// On failure, Error carries the message so it survives JSON serialization.
type CallsFromSourceOutput struct {
	Calls            []ClusteredCall `json:"calls"`              // all calls, sorted by file then start time
	TotalCalls       int             `json:"total_calls"`        // len(Calls)
	SpeciesCount     map[string]int  `json:"species_count"`      // number of calls per ebird code
	DataFilesWritten int             `json:"data_files_written"` // files ProcessFile reported as written
	DataFilesSkipped int             `json:"data_files_skipped"` // files ProcessFile reported as skipped
	FilesProcessed   int             `json:"files_processed"`    // source files handled
	FilesDeleted     int             `json:"files_deleted"`      // source files removed (Delete mode)
	Filter           string          `json:"filter"`             // name of the CallSource used
	Error            *string         `json:"error,omitempty"`    // failure message, nil on success
}
// CallSource abstracts one kind of call-bearing source file so that the
// sequential and parallel processing pipelines can be shared across sources.
type CallSource interface {
	// Name returns a short label for this source, used in error messages
	// and in the output's Filter field.
	Name() string
	// FindFiles lists this source's files under folder.
	FindFiles(folder string) ([]string, error)
	// ProcessFile extracts clustered calls from the file at path, using
	// cache for per-directory state; written and skipped report whether
	// the file's derived data file was written or skipped.
	ProcessFile(path string, cache *DirCache) (calls []ClusteredCall, written, skipped bool, err error)
}
// callsFromSource resolves the list of files to process for src (either the
// single input.File or everything FindFiles discovers under input.Folder),
// then dispatches to the sequential or parallel implementation depending on
// how many files were found.
//
// Exactly one of input.File or input.Folder must be set. The returned output
// always carries src.Name() in Filter and, on failure, the error message in
// Error so it survives JSON serialization.
func callsFromSource(src CallSource, input CallsFromSourceInput) (CallsFromSourceOutput, error) {
	// Below this many files the worker-pool machinery costs more than it
	// saves, so use the simple sequential loop instead.
	const parallelThreshold = 10

	var output CallsFromSourceOutput
	output.Filter = src.Name()

	// fail records msg on the output (for callers that serialize it) and
	// returns the same text as the Go error.
	fail := func(msg string) (CallsFromSourceOutput, error) {
		output.Error = &msg
		return output, errors.New(msg)
	}

	var files []string
	switch {
	case input.File != "":
		files = []string{input.File}
	case input.Folder != "":
		var err error
		files, err = src.FindFiles(input.Folder)
		if err != nil {
			return fail(fmt.Sprintf("Failed to find %s files: %v", src.Name(), err))
		}
	default:
		return fail("Either --folder or --file must be specified")
	}
	if len(files) == 0 {
		return fail(fmt.Sprintf("No %s files found", src.Name()))
	}

	if len(files) < parallelThreshold {
		return callsFromSourceSequential(src, input, files)
	}
	return callsFromSourceParallel(src, input, files)
}
// callsFromSourceSequential processes files one at a time on the calling
// goroutine; it is used for small batches where the worker pool in
// callsFromSourceParallel is not worth the overhead.
//
// Per-directory DirCaches are created lazily and reused across files in the
// same directory. On the first processing or deletion error the function
// stops, records the message in output.Error, and returns; the aggregate
// counters are only populated on full success.
func callsFromSourceSequential(src CallSource, input CallsFromSourceInput, files []string) (CallsFromSourceOutput, error) {
	var output CallsFromSourceOutput
	output.Filter = src.Name()

	// fail mirrors the error-reporting convention used by callsFromSource:
	// the message is stored on the output and also returned as the error.
	fail := func(msg string) (CallsFromSourceOutput, error) {
		output.Error = &msg
		return output, errors.New(msg)
	}

	// One DirCache per directory; pre-seed the root folder when given.
	dirCaches := make(map[string]*DirCache)
	if input.Folder != "" {
		dirCaches[input.Folder] = NewDirCache(input.Folder)
	}

	speciesCount := make(map[string]int)
	var allCalls []ClusteredCall
	dataFilesWritten := 0
	dataFilesSkipped := 0
	filesProcessed := 0
	filesDeleted := 0
	for _, file := range files {
		dir := filepath.Dir(file)
		cache := dirCaches[dir]
		if cache == nil {
			cache = NewDirCache(dir)
			dirCaches[dir] = cache
		}
		calls, written, skipped, err := src.ProcessFile(file, cache)
		if err != nil {
			return fail(fmt.Sprintf("Error processing %s: %v", file, err))
		}
		if written {
			dataFilesWritten++
		}
		if skipped {
			dataFilesSkipped++
		}
		allCalls = append(allCalls, calls...)
		for _, call := range calls {
			speciesCount[call.EbirdCode]++
		}
		filesProcessed++
		// Only delete a source file whose data was actually written, so a
		// skipped file is never lost.
		if input.Delete && written {
			if err := os.Remove(file); err != nil {
				return fail(fmt.Sprintf("Failed to delete %s: %v", file, err))
			}
			filesDeleted++
		}
		if input.ProgressHandler != nil {
			input.ProgressHandler(filesProcessed, len(files), filepath.Base(file))
		}
	}

	// Deterministic output order: by source file, then by call start time.
	sort.Slice(allCalls, func(i, j int) bool {
		if allCalls[i].File != allCalls[j].File {
			return allCalls[i].File < allCalls[j].File
		}
		return allCalls[i].StartTime < allCalls[j].StartTime
	})
	output.Calls = allCalls
	output.TotalCalls = len(allCalls)
	output.SpeciesCount = speciesCount
	output.DataFilesWritten = dataFilesWritten
	output.DataFilesSkipped = dataFilesSkipped
	output.FilesProcessed = filesProcessed
	output.FilesDeleted = filesDeleted
	return output, nil
}
// sourceJob is one unit of work for sourceWorker: a single file to process.
type sourceJob struct {
	filePath string // path of the source file to process
}
// sourceResult is the outcome of processing a single file. Its accessor
// methods satisfy the result interface consumed via the parallelResult
// channel (see sourceWorker / aggregateResults).
type sourceResult struct {
	path    string          // file that was processed
	calls   []ClusteredCall // calls extracted from the file
	written bool            // data file was written
	skipped bool            // data file was skipped
	err     error           // non-nil if processing failed
}

// filePath returns the path of the processed file.
func (r sourceResult) filePath() string { return r.path }

// getCalls returns the calls extracted from the file.
func (r sourceResult) getCalls() []ClusteredCall { return r.calls }

// wasWritten reports whether the data file was written.
func (r sourceResult) wasWritten() bool { return r.written }

// wasSkipped reports whether the data file was skipped.
func (r sourceResult) wasSkipped() bool { return r.skipped }

// getError returns the processing error, if any.
func (r sourceResult) getError() error { return r.err }
// callsFromSourceParallel fans the files out to DOT_DATA_WORKERS goroutines
// and aggregates their results. Behavior matches the sequential path: the
// first error aborts the run and is recorded on the output.
func callsFromSourceParallel(src CallSource, input CallsFromSourceInput, files []string) (CallsFromSourceOutput, error) {
	var output CallsFromSourceOutput
	output.Filter = src.Name()

	fileCount := len(files)
	var doneCount atomic.Int32

	// Concurrency-safe map of per-directory caches shared by all workers;
	// seed the root folder's cache up front when one was supplied.
	caches := &sync.Map{}
	if input.Folder != "" {
		caches.Store(input.Folder, NewDirCache(input.Folder))
	}

	// Both channels are buffered to the job count so enqueueing and result
	// delivery never block.
	jobs := make(chan sourceJob, fileCount)
	results := make(chan parallelResult, fileCount)

	var wg sync.WaitGroup
	for w := 0; w < DOT_DATA_WORKERS; w++ {
		wg.Add(1)
		go sourceWorker(src, caches, jobs, results, &wg)
	}
	for _, path := range files {
		jobs <- sourceJob{filePath: path}
	}
	close(jobs)

	// Close results once every worker has drained the job queue, so the
	// aggregator's range loop terminates.
	go func() {
		wg.Wait()
		close(results)
	}()

	stats := aggregateResults(results, fileCount, &doneCount, input.Delete, input.ProgressHandler)
	if stats.firstErr != nil {
		msg := stats.firstErr.Error()
		output.Error = &msg
		return output, stats.firstErr
	}

	sortCallsByFileAndTime(stats.calls)
	output.Calls = stats.calls
	output.TotalCalls = len(stats.calls)
	output.SpeciesCount = stats.speciesCount
	output.DataFilesWritten = stats.dataFilesWritten
	output.DataFilesSkipped = stats.dataFilesSkipped
	output.FilesProcessed = stats.filesProcessed
	output.FilesDeleted = stats.filesDeleted
	return output, nil
}
// sourceWorker consumes jobs until the channel is closed, processing each
// file with a per-directory DirCache shared through dirCaches, and emits one
// sourceResult per job (including failures, which carry err for the
// aggregator to report). Decrements wg on exit.
func sourceWorker(src CallSource, dirCaches *sync.Map, jobs <-chan sourceJob, results chan<- parallelResult, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {
		dir := filepath.Dir(job.filePath)
		// Fast path: reuse an existing cache for this directory. On a miss,
		// LoadOrStore guarantees that workers racing on the same directory
		// all end up sharing one canonical DirCache — the previous
		// Load-then-Store sequence could install two different caches.
		cached, ok := dirCaches.Load(dir)
		if !ok {
			cached, _ = dirCaches.LoadOrStore(dir, NewDirCache(dir))
		}
		cache := cached.(*DirCache)
		calls, written, skipped, err := src.ProcessFile(job.filePath, cache)
		results <- sourceResult{
			path:    job.filePath,
			calls:   calls,
			written: written,
			skipped: skipped,
			err:     err,
		}
	}
}