# FCL6FKHMM6LX7HNI7F3CH4GRUIXCC2APQIZUKTJSLSIS6SRP2SPQC
# Train.jl

export train #beware Flux.train! is not Skraak.train

import Base: length, getindex
import MLBase
using CUDA, Dates, Images, Flux, Glob, JLD2, Noise
using Random: shuffle!, seed!
using Metalhead: ResNet

# Registry translating a text label (the name of an image's parent folder) into
# its integer class index. It is a global so that the `getindex` methods used by
# the data loaders can look labels up; `register_label_to_index!` refills it.
const LABELTOINDEX = Dict{String,Int32}()

# Type of the `pretrain` argument of `train`: a Bool (use/skip the pretrained
# weights) or a String (path to a saved .jld2 model state).
const Model = Union{Bool,String}

"""
    train(model_name, train_epochs, images, pretrain=true, train_test_split=0.8,
          batch_size=64; save_dir=pwd())

Train a ResNet18 classifier on the png files listed in `images` and save one
model state (jld2) per epoch, plus a text file with the label mapping.

# Arguments
- `model_name::String`: used in the names of the saved files.
- `train_epochs::Int64`: number of epochs to train for.
- `images::Vector{String}`: paths to png files; the parent folder of each file
  is its label. The vector is not modified.
- `pretrain::Model`: `true`/`false` for a fresh ResNet18 with/without the
  pretrained weights, or the path to a jld2 file holding a `model_state`.
- `train_test_split::Float64`: fraction of the batches used for training.
- `batch_size::Int64`: batch size; images beyond the last full batch are dropped.

# Keywords
- `save_dir`: folder the jld2 and label files are written to (created if
  missing). Defaults to the current directory.

# Notes
- Dont forget temp env, julia -t 4
- Assumes 224x224 pixel RGB images as png's

# Examples
```julia
using Skraak, Glob
images = Glob.glob("kiwi_set*/*/[N,K]/*.png") #11699814-element Vector{String}
model = "/media/david/SSD2/PrimaryDataset/model_K1-9_original_set_CPU_epoch-7-0.9924-2024-03-05.jld2"
train("K1-10_total_set_no_augumentation", 2, images, model, 0.97, 64)

images = Glob.glob("*/[D,F,M,N]/*.png") #from SSD2/Clips
model = "/media/david/SSD2/PrimaryDataset/model_K1-5_CPU_epoch-6-0.9795-2023-12-16.jld2"
train("DFMN1-5", 20, images, model)
```
"""
function train(
    model_name::String,
    train_epochs::Int64,
    images::Vector{String}, #glob_pattern::String = "*/*.png"
    pretrain::Model = true,
    train_test_split::Float64 = 0.8,
    batch_size::Int64 = 64;
    save_dir::AbstractString = pwd(),
)
    epochs = 1:train_epochs
    @assert !isempty(images) "No png images found"
    @info "$(length(images)) images in dataset"

    label_to_index = labels_to_dict(images)
    register_label_to_index!(label_to_index)
    @info "Text labels translate to: " label_to_index

    classes = length(label_to_index)
    @assert classes >= 2 "At least 2 label classes are required, for example: kiwi, not_kiwi"
    @info "$classes classes in dataset"
    @info "Device: $device"

    ceiling = seil(length(images), batch_size)
    # With fewer images than one batch every loader would be empty and the run
    # would "train" on nothing.
    @assert ceiling > 0 "At least batch_size ($batch_size) images are required"
    train_test_index = train_test_idx(ceiling, batch_size, train_test_split)

    train_loader, train_sample, test =
        process_data(images, train_test_index, ceiling, batch_size)
    @info "Made data loaders"

    model = load_model(pretrain, classes)
    @info "Loaded model"

    opt = Flux.setup(Flux.Optimisers.Adam(1e-5), model)
    @info "Setup optimiser"

    @info "Training for $epochs epochs: " now()
    training_loop!(
        model,
        opt,
        train_loader,
        train_sample,
        test,
        epochs,
        model_name,
        classes,
        label_to_index,
        save_dir,
    )
    @info "Finished $(last(epochs)) epochs: " now()
    return nothing
end

# Training images: `getindex` loads the file and applies augmentation.
struct ImageContainer{T<:Vector}
    img::T
end

# Evaluation images: `getindex` loads the file without augmentation.
struct ValidationImageContainer{T<:Vector}
    img::T
end

const Container = Union{ImageContainer,ValidationImageContainer}

# Largest multiple of `batch_size` that is not greater than `n`.
function seil(n::Int, batch_size::Int)
    return n ÷ batch_size * batch_size
end

# Number of leading images that go to the training set: the number of whole
# batches times `train_test_split`, rounded to a whole batch count.
function train_test_idx(ceiling::Int, batch_size::Int, train_test_split::Float64)::Int
    train_batches = round(Int, (ceiling ÷ batch_size) * train_test_split)
    return train_batches * batch_size
end

# Text label of an image: the name of the folder that contains the file.
_label_from_path(path::AbstractString) = String(basename(dirname(path)))

# Map the unique labels found in `list`, sorted alphabetically, to 1, 2, 3...
function labels_to_dict(list::Vector{String})::Dict{String,Int32}
    labels = sort!(unique(map(_label_from_path, list)))
    return Dict{String,Int32}(label => index for (index, label) in enumerate(labels))
end

"""
    register_label_to_index!(label_to_index::Dict{String,Int32})

This will replace the content of the global variable LABELTOINDEX
with the content intended by the caller.

Thanks algunion
https://discourse.julialang.org/t/dataloader-scope-troubles/105207/4
"""
function register_label_to_index!(label_to_index::Dict{String,Int32})
    empty!(LABELTOINDEX)
    merge!(LABELTOINDEX, label_to_index)
end

device = CUDA.functional() ? gpu : cpu

# Shuffle the file names with a fixed seed and build three loaders:
# training (augmented), a sample of the training images for measuring training
# accuracy (not augmented), and test (not augmented).
function process_data(array_of_file_names, train_test_index, ceiling, batch_size)
    seed!(1234)
    # Shuffle a copy so the caller's vector keeps its order.
    images = shuffle!(copy(array_of_file_names))

    # The training sample has the size of the test set, but must never reach
    # past the training images into the test images.
    sample_size = min(ceiling - train_test_index, train_test_index)

    train = make_dataloader(ImageContainer(images[1:train_test_index]), batch_size)
    train_sample =
        make_dataloader(ValidationImageContainer(images[1:sample_size]), batch_size)
    test = make_dataloader(
        ValidationImageContainer(images[train_test_index+1:ceiling]),
        batch_size,
    )
    return train, train_sample, test
end

length(data::ImageContainer) = length(data.img)
length(data::ValidationImageContainer) = length(data.img)

# Convert a loaded image to a Float32 array: channel view, then the three axes
# reversed so that the colour channel comes last.
function _image_to_array(img)
    return permutedims(collect(channelview(float32.(img))), (3, 2, 1))
end

# Load one training image, add gaussian noise and random grey bars, and return
# it with its integer label.
function getindex(data::ImageContainer{Vector{String}}, index::Int)
    path = data.img[index]
    #Images.imresize(x, 224, 224) and Images.RGB.(x) are skipped, see train notes
    noisy = Noise.add_gauss(Images.load(path), (rand() * 0.2))
    masked = apply_mask!(noisy, 3, 3, 12)
    img = _image_to_array(masked)
    y = LABELTOINDEX[_label_from_path(path)]
    return img, y
end

# Load one evaluation image (no augmentation) and return it with its label.
function getindex(data::ValidationImageContainer{Vector{String}}, index::Int)
    path = data.img[index]
    #Images.imresize(x, 224, 224) and Images.RGB.(x) are skipped, see train notes
    img = _image_to_array(Images.load(path))
    y = LABELTOINDEX[_label_from_path(path)]
    return img, y
end

# Overwrite up to `max_number` random horizontal and up to `max_number` random
# vertical bars of `img` with grey. Bar widths are drawn from
# `min_size:max_size`. Mutates and returns `img`.
function apply_mask!(
    img::Array{RGB{N0f8},2},
    max_number::Int = 3,
    min_size::Int = 3,
    max_size::Int = 22,
)
    grey = RGB{N0f8}(0.7, 0.7, 0.7)
    # horizontal
    for range in get_random_ranges(max_number, min_size, max_size, size(img, 1))
        img[range, :] .= grey
    end
    # vertical
    for range in get_random_ranges(max_number, min_size, max_size, size(img, 2))
        img[:, range] .= grey
    end
    return img
end

# Draw between 0 and `max_number` index ranges that fit inside `1:dim`.
# `dim` defaults to 224, the image side length the rest of the file assumes.
function get_random_ranges(max_number::Int, min_size::Int, max_size::Int, dim::Int = 224)
    number = rand(0:max_number)
    ranges = UnitRange{Int}[]
    # Every candidate is rejected when even the smallest bar cannot fit, which
    # would make the loop below spin forever.
    if number > 0 && min_size >= dim
        throw(ArgumentError("min_size ($min_size) must be smaller than dim ($dim)"))
    end
    while length(ranges) < number
        start = rand(1:dim)
        size = rand(min_size:max_size)
        start + size > dim && continue
        push!(ranges, start:(start+size))
    end
    return ranges
end

# Batch a container; wrap the loader in a CuIterator when running on the GPU.
function make_dataloader(container::Container, batch_size::Int)
    loader =
        Flux.DataLoader(container; batchsize = batch_size, collate = true, parallel = true)
    return device == gpu ? CuIterator(loader) : loader
end

# Pooling + flatten + dense layer that follows the ResNet18 backbone.
function _classifier_head(classes::Integer)
    return Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => classes))
end

# see load_model() from predict, and below
# ResNet18 backbone (pretrained weights when `pretrain` is true) with a new
# head sized for `classes`.
function load_model(pretrain::Bool, classes::Int64)
    fst = ResNet(18, pretrain = pretrain).layers
    model = Flux.Chain(fst[1], _classifier_head(classes)) |> device
    return model
end

#If model classes == desired classes I don't empty the last layer
#That means that I can just train from where I left off for new data, DFMN model
#Could be a gotcha if I want to train a different 4 class model, no need for a switch just yet
function load_model(model_path::String, classes::Int64)
    model_state = JLD2.load(model_path, "model_state")
    # NOTE(review): presumably this indexes the bias of the final Dense layer,
    # whose length is the class count of the saved model - confirm.
    model_classes = length(model_state[1][2][1][3][2])

    f = ResNet(18, pretrain = false).layers
    m = Flux.Chain(f[1], _classifier_head(model_classes))
    Flux.loadmodel!(m, model_state)

    if classes == model_classes
        model = m |> device
    else
        # Keep the trained backbone, replace the head with a fresh one.
        fst = m.layers
        model = Flux.Chain(fst[1], _classifier_head(classes)) |> device
    end
    return model
end

# Run model `m` over loader `d` and return the accuracy (rounded to 4 digits)
# and the confusion matrix for `c` classes.
function evaluate(m, d, c)
    good = 0
    total = 0
    pred = Int64[]
    actual = Int64[]
    for (x, y) in d
        # Copy both vectors to host arrays first, so the comparison and the
        # appends below never have to index a device array element by element.
        p = Array(Flux.onecold(m(x)))
        labels = Array(y)
        good += sum(p .== labels)
        total += length(labels)
        append!(pred, p)
        append!(actual, labels)
    end
    accuracy = round(good / total, digits = 4)
    confusion_matrix = MLBase.confusmat(c, actual, pred)
    #freqtable(DataFrames.DataFrame(targets = actual, predicts = pred), :targets, :predicts)
    #roc=MLBase.roc(actual, pred, 100)
    #f1=MLBase.f1score(roc)
    return accuracy, confusion_matrix #, roc, f1
end

# One pass over the training loader with logit cross entropy loss.
function train_epoch!(model; opt, train, classes)
    Flux.train!(model, train, opt) do m, x, y
        Flux.Losses.logitcrossentropy(m(x), Flux.onehotbatch(y, 1:classes))
    end
end

# Write the label => index mapping to a dated text file in `save_dir`,
# one pair per line, ordered by index.
function dict_to_text_file(dict, model_name, save_dir = pwd())
    path = joinpath(save_dir, "labels_$(model_name)-$(today()).txt")
    open(path, "w") do file
        for (key, value) in sort!(collect(dict); by = last)
            write(file, "$(key) => $(value)\n")
        end
    end
    @info "Saved labels to file for future reference"
end

# Evaluate once as a warm up, then for every epoch: train, report accuracy and
# confusion matrix on the training sample and the test set, and save the model
# state (moved to the CPU) into `save_dir`.
function training_loop!(
    model,
    opt,
    train,
    train_sample,
    test,
    epochs::UnitRange{Int64},
    model_name::String,
    classes,
    label_to_index,
    save_dir::AbstractString = pwd(),
)
    # Fail before training starts, not after the first epoch, if the folder
    # cannot be created.
    mkpath(save_dir)

    warmup_accuracy, vcm = @time evaluate(model, test, classes)
    @info "warm up accuracy" accuracy = warmup_accuracy
    @info "warm up confusion matrix" vcm

    for epoch in epochs
        println("")
        @info "Starting Epoch: $epoch"
        epoch == 1 && dict_to_text_file(label_to_index, model_name, save_dir)
        @time train_epoch!(model; opt, train, classes)

        train_accuracy, train_confusion_matrix =
            @time evaluate(model, train_sample, classes)
        @info "Epoch: $epoch"
        @info "train" accuracy = train_accuracy
        @info "train" train_confusion_matrix

        test_accuracy, test_confusion_matrix = @time evaluate(model, test, classes)
        @info "test" accuracy = test_accuracy
        @info "test" test_confusion_matrix

        # number kiwi guessed right, assumes kiwi=1, not=2 (alphabetical)
        # A "save only when test_confusion_matrix[1,1] improves" switch used to
        # live here; every epoch is saved for now.
        let _model = cpu(model)
            jldsave(
                joinpath(
                    save_dir,
                    "model_$(model_name)_CPU_epoch-$epoch-$test_accuracy-$(today()).jld2",
                );
                model_state = Flux.state(_model),
            )
            @info "Saved a best_model"
        end
    end
end
module SkraakML

"""
    greet()

Print the greeting `Hello World!` to standard output, without a trailing newline.
"""
function greet()
    return print("Hello World!")
end

end # module SkraakML
# Predict.jl

export predict
export get_images_from_audio

using WAV, DSP, Images, ThreadsX, Dates, DataFrames, CSV, Flux, CUDA, Metalhead, JLD2, FLAC, Glob
using PerceptualColourMaps
import Base: length, getindex

##Dependency, duplicated from Utility
# Resample `signal` (samples along the first dimension) from `freq` to 16000 Hz.
function _resample_to_16000hz(signal, freq)
    signal = DSP.resample(signal, 16000.0f0 / freq; dims = 1)
    freq = 16000
    return signal, freq
end

##Dependency, duplicated from Clips
# Turn one audio clip into a 224x224x3 Float32 spectrogram image.
function _get_image_from_sample(sample, f) #sample::Vector{Float64}
    S = DSP.spectrogram(sample, 400, 2; fs = convert(Int, f))
    i = S.power
    if minimum(i) == 0.0
        # Replace exact zeros by the next smallest value, so that the dB
        # conversion below does not produce -Inf.
        # NOTE(review): l[2] does not exist when every value is zero (a fully
        # silent clip) - confirm whether that input can occur.
        l = i |> vec |> unique |> sort
        replace!(i, 0.0 => l[2])
    end
    db = DSP.pow2db.(i)
    shifted = db .+ abs(minimum(db))
    scaled = shifted ./ maximum(shifted)
    flipped = reverse(scaled, dims = 1)
    coloured = PerceptualColourMaps.applycolourmap(flipped, cmap("L4"))
    #RGB.(coloured) is not applied
    resized = imresize(coloured, 224, 224)
    image = Float32.(resized)
    return image
end

"""
    predict(glob_pattern::String, model::String)
    predict(folders::Vector{String}, model::String)

This function takes a glob pattern for folders (or a vector of folders) to run
over, and a model path. It saves results in a csv in each folder, similar to
opensoundscape.

Args:

• glob pattern (folder/) or a vector of folders
• model path

Returns: Nothing - This function saves csv files.

I use this function to find kiwi from new data gathered on a trip. And to
predict D/F/M/N for images clipped from primary detections.
It works on both audio (wav or flac) and png images.

Note:
From Pomona-3/Pomona-3/
julia -t 4
Dont forget temp environment: ] activate --temp

Use like:
```julia
using Skraak
glob_pattern = "*/*/"
model = "/media/david/SSD2/PrimaryDataset/model_K1-9_original_set_CPU_epoch-7-0.9924-2024-03-05.jld2"

glob_pattern = "Clips_2024-10-21/"
model = "/media/david/SSD1/Clips/model_DFMN1-5_CPU_epoch-18-0.9132-2024-01-29.jld2"
predict(glob_pattern, model)
```
"""
function predict(glob_pattern::String, model::String)
    return predict(Glob.glob(glob_pattern), model)
end

function predict(folders::Vector{String}, model::String)
    net = load_model_pred(model) |> device
    @info "Folders: $folders"
    for folder in folders
        @info "Working on: $folder"
        predict_folder(folder, net)
    end
    return nothing
end

#~~~~~ The guts ~~~~~#

# see load_model() from train, different input types
# Rebuild a ResNet18 with a head sized from the saved state and load the state.
function load_model_pred(model_path::String)
    model_state = JLD2.load(model_path, "model_state")
    # NOTE(review): presumably this indexes the bias of the final Dense layer,
    # whose length is the class count of the saved model - confirm.
    model_classes = length(model_state[1][2][1][3][2])
    @info "Model classes: $model_classes"
    f = Metalhead.ResNet(18, pretrain = false).layers
    l = Flux.Chain(AdaptiveMeanPool((1, 1)), Flux.flatten, Dense(512 => model_classes))
    model = Flux.Chain(f[1], l)
    Flux.loadmodel!(model, model_state)
    return model
end

#=
function load_bson(model_path::String)
BSON.@load model_path model
end
=#

# Predict on the png files of `folder` when there are any, otherwise on its
# wav/flac files.
function predict_folder(folder::String, model)
    wav = Glob.glob("$folder/*.[W,w][A,a][V,v]")
    flac = Glob.glob("$folder/*.flac")
    audio_files = vcat(wav, flac) #if wav and flac both present will predict on all
    png_files = Glob.glob("$folder/*.png")
    #it will predict on images when both images and audio present
    if !isempty(png_files)
        predict_image_folder(png_files, model, folder)
    elseif !isempty(audio_files)
        predict_audio_folder(audio_files, model, folder)
    else
        @info "No png, flac, wav, WAV files present in $folder"
    end
end

device = CUDA.functional() ? gpu : cpu

# Predict from png images

struct PredictImageContainer{T<:Vector}
    img::T
end

length(data::PredictImageContainer) = length(data.img)

# Load one png, resize it to 224x224 RGB and return it as a Float32 array
# (colour channel last) together with its path.
function getindex(data::PredictImageContainer{Vector{String}}, idx::Int)
    path = data.img[idx]
    resized = Images.imresize(Images.load(path), 224, 224)
    rgb = Images.RGB.(resized)
    img = permutedims(collect(channelview(float32.(rgb))), (3, 2, 1))
    return img, path
end

# Predict every png in `png_files` and write `preds-<today>.csv` into `folder`
# with the columns file (name without folder) and label.
function predict_image_folder(png_files::Vector{String}, model, folder::String)
    l = length(png_files)
    @assert (l > 0) "No png files present in $folder"
    @info "$(l) png_files in $folder"
    save_path = "$folder/preds-$(today()).csv"
    loader = png_loader(png_files)
    preds, files = @time predict_pngs(model, loader)
    df = DataFrames.DataFrame(file = basename.(files), label = preds)
    CSV.write(save_path, df)
end

# Batched loader over png files; wrapped in a CuIterator on the GPU.
function png_loader(png_files::Vector{String})
    loader = Flux.DataLoader(
        PredictImageContainer(png_files);
        batchsize = 64,
        collate = true,
        parallel = true,
    )
    return device == gpu ? CuIterator(loader) : loader
end

# Run model `m` over loader `d`; return predicted labels and the file paths.
function predict_pngs(m, d)
    @info "Predicting..."
    pred = Int[]
    path = String[]
    for (x, pth) in d
        # Array(...) copies the labels to the host when the model ran on the GPU.
        append!(pred, Array(Flux.onecold(m(x))))
        append!(path, pth)
    end
    return pred, path
end

# Predict from audio files

# Predict every audio file and append the results to `preds-<today>.csv` in
# `folder`. The header row is written first, then one chunk per file.
function predict_audio_folder(audio_files::Vector{String}, model, folder::String)
    l = length(audio_files)
    @assert (l > 0) "No wav or flac audio files present in $folder"
    @info "$(l) audio_files in $folder"
    header = DataFrames.DataFrame(
        file = String[],
        start_time = Float64[],
        end_time = Float64[],
        label = Int[],
    )
    save_path = "$folder/preds-$(today()).csv"
    CSV.write(save_path, header)
    for file in audio_files
        df = predict_audio_file(file, model)
        CSV.write(save_path, df, append = true)
    end
end

# Predict one audio file; return a DataFrame with one row per clip:
# file, start_time, end_time (seconds from the start of the file), label.
function predict_audio_file(file::String, model)
    #check form of opensoundscape preds.csv and needed by my make_clips
    @info "File: $file"
    data = @time audio_loader(file)
    pred = Int[]
    times = Tuple{Float64,Float64}[]
    @time for (x, t) in data
        # Array(...) copies to the host when the batch lives on the GPU.
        append!(pred, Array(Flux.onecold(model(x))))
        append!(times, Array(t))
    end
    df = DataFrames.DataFrame(
        :file => fill(file, length(times)),
        :start_time => first.(times),
        :end_time => last.(times),
        :label => pred,
    )
    sort!(df)
    return df
end

# Build a loader holding all clips of `file` in a single batch, each paired with
# its (start, end) time in seconds. Clips are `increment` seconds long and start
# every `increment / divisor` seconds (every `increment` seconds when `divisor`
# is 0), matching how get_images_from_audio cuts the signal.
function audio_loader(file::String, increment::Int = 5, divisor::Int = 2)
    raw_images, n_samples = get_images_from_audio(file, increment, divisor)
    images = reshape_images(raw_images, n_samples)

    # Start time and end time for each clip, in seconds relative to the start of the file.
    step = divisor == 0 ? float(increment) : increment / divisor
    times = [(start, start + increment) for start in step .* (0:(n_samples-1))]

    # max(.., 1): a file shorter than one clip gives zero clips, and a batch
    # size of 0 is not a valid value.
    loader =
        Flux.DataLoader((images, times), batchsize = max(n_samples, 1), shuffle = false)
    return device == gpu ? CuIterator(loader) : loader #check this works with gpu
end

# Stack the clip images into one (224, 224, 3, n_samples) array.
# Each image is flattened to a column first: hcat on the 3-d images themselves
# joins them along the width, and reshaping that result mixes the colour
# channels of different clips as soon as there is more than one clip.
function reshape_images(raw_images, n_samples)
    isempty(raw_images) && return Array{Float32,4}(undef, 224, 224, 3, 0)
    columns = reduce(hcat, map(vec, raw_images))
    images = reshape(columns, (224, 224, 3, n_samples))
    return images
end

#= not needed
function get_image_for_inference(sample, f)
image =
_get_image_from_sample(sample, f) |>
# x -> collect(channelview(float32.(x))) |>
x -> permutedims(x, (3, 2, 1))
return image
end
=#

# need to change divisor to a overlap fraction, check interaction with audio_loader()
# if divisor is 0, then no overlap atm
# Cut the first channel of `file` into clips of `increment` seconds that start
# every `increment / divisor` seconds, and return the spectrogram image of each
# clip together with the number of clips. Audio above 16000 Hz is resampled to
# 16000 Hz first. A trailing remainder shorter than a full clip is dropped.
function get_images_from_audio(file::String, increment::Int = 5, divisor::Int = 2) #5s sample, 2.5s hop
    signal, freq = load_audio_file(file)
    if freq > 16000
        signal, freq = _resample_to_16000hz(signal, freq)
    end
    f = convert(Int, freq)
    inc = increment * f
    hop = divisor == 0 ? inc : inc ÷ divisor
    noverlap = inc - hop

    # arraysplit hands out the same buffer for every window, so each window is
    # copied before the threaded map below reads them side by side.
    split_signal = [copy(window) for window in DSP.arraysplit(signal[:, 1], inc, noverlap)]
    raw_images = ThreadsX.map(x -> _get_image_from_sample(x, f), split_signal)
    n_samples = length(raw_images)
    return raw_images, n_samples
end

# Read a wav or flac file; return the signal and its sample rate.
function load_audio_file(file::String)
    # Compare in lower case: the folder glob accepts any capitalisation of wav.
    ext = lowercase(split(file, ".")[end])
    @assert ext in ("wav", "flac") "Unsupported audio file type, requires wav or flac."
    if ext == "wav"
        signal, freq = WAV.wavread(file)
    else
        signal, freq = load(file)
    end
    @assert !isempty(signal[:, 1]) "$file seems to be empty, could it be corrupted?\nYou could delete it, or replace it with a known\ngood version from SD card or backup."
    return signal, freq
end

############### PYTHON Opensoundscape ################
#=
# Python 3.8.12, opensoundscape 0.7.1
# Dont forget conda activate opensoundscape
# Dont forget to modify file names and glob pattern
# Run script in Pomona-2, hard code trip date in the glob
# python /media/david/USB/Skraak/src/predict.py

from opensoundscape.torch.models.cnn import load_model
import opensoundscape
import torch
from pathlib import Path
import numpy as np
import pandas as pd
from glob import glob
import os
from datetime import datetime

model = load_model('/home/david/best.model0')

# folders = glob('./*/2023-?????/')
# folders = glob('./*/*/2024-05-0?')
folders = glob('./*/2024-10-18/')
for folder in folders:
    os.chdir(folder)
    print(folder, ' start: ', datetime.now())
    # Beware, secretary island files are .wav
    field_recordings = glob('./*.[W,w][A,a][V,v]')
    scores, preds, unsafe = model.predict(
        field_recordings,
        binary_preds = 'single_target',
        overlap_fraction = 0.5,
        batch_size = 128,
        num_workers = 12)
    scores.to_csv("scores-2024-10-21.csv")
    preds.to_csv("preds-2024-10-21.csv")
    os.chdir('../..') # Be careful this matches the folders glob above
    print(folder, ' done: ', datetime.now())
    print()
    print()
=#

#=
Kahurangi

folders = glob('./*/')
for folder in folders:
    os.chdir(folder)
    print(folder, ' start: ', datetime.now())
    # Beware, secretary island files are .wav
    field_recordings = glob('./*.[W,w][A,a][V,v]')
    scores, preds, unsafe = model.predict(
        field_recordings,
        binary_preds = 'single_target',
        overlap_fraction = 0.5,
        batch_size = 128,
        num_workers = 12)
    scores.to_csv("scores-2024-10-21.csv")
    preds.to_csv("preds-2024-10-21.csv")
    os.chdir('./..') # Be careful this matches the folders glob above
    print(folder, ' done: ', datetime.now())
    print()
    print()
=#
# Skraak

Identify bird calls using AI, and monitor call frequency.

__Skraak is intended to be simple to use for simple people like me.__

This package serves [skraak.kiwi](https://skraak.kiwi).

Most of the skraak.kiwi data has been recorded using Open Acoustics AudioMoths or μMoths at 16000 Hz. DOC recorders at 8000 Hz work fine.

It is a good idea to use an Nvidia GPU. Everything should work fine on CPU, just slow.

AMD and Apple Silicon GPUs are not supported but should be easy for you to get working with the Julia AMD or Metal packages.

If you are doing serious work, start the Julia REPL with `julia -t n`, where n is up to 1/2 the number of cores you have. I do 4, this is enough to keep up with a gamer style GPU.

__You can use Skraak too.__

```
[Install Julia](https://julialang.org/downloads/platform/), Julia-1.10 or newer
[git clone the Skraak project](https://github.com/quietlight/Skraak), if you don't have git or the git cli, you can download a zip file by clicking the <code> button.
cd to your Skraak folder
start the julia repl with $julia
(You will want to install Revise and OhMyREPL, just do 'using Revise, OhMyREPL' in the Julia repl, add 'using Revise, OhMyREPL' to ~/.julia/config/startup.jl)
type: ] (to enter Pkg mode)
type: activate .
type: instantiate
backspace to exit Pkg mode
exit repl with ctrl-D.
```

Later:

```
start the julia repl with $julia
type: ] (to enter Pkg mode)
type: dev path/to/Skraak (to make it a local package)
backspace to exit Pkg mode
type: using Skraak, Glob (glob is only here to help you refine your glob patterns)
WORK...
When finished working you can if you like do 'free Skraak' in Pkg mode (accessed with ']')
```

1. Take some WAV's organised into a file structure LOCATION/TRIP_DATE/WAV_FILES
2. and labels saved in a csv in the form:
   * file(String),start_time,end_time,label(Integer) (where start_time and end_time are in seconds from the start of the wav file)
   * at least 2 label classes are required, for example Kiwi, Not
3. 
Generate a primary dataset of spectrogram images with the following file structure:
   * DATASET/AUDIO_FILE*/LABEL*/PNG's (png files must be 224x224 px square RGB).
   * This structure is required, when training, __the parent folder of a file is the label__.
   * This function creates a folder for each file, creates subfolders for each label, then saves png files in the appropriate label sub folder.
   * Space is needed. It uses the whole audio file. (I aim for 96% Not, 4% Kiwi)
   * and saves a flac copy for reference

> I use labels, [K, N] in words [Kiwi, Not]. Anything will work, the unique text labels are sorted alphabetically and mapped to integer labels in the training process.

> More than 2 label classes is fine, but keep it simple until you have a lot of data.

> It is better __not__ to have everything in big folders, 100_000 files in a folder on a Fat32 removable drive will rapidly grind to a standstill.

> You could have many thousands of K and N folders, for example, the model does not care.

> Native file systems on mac/linux will work ok. I use ext4 (linux) file systems on external SSD's for both linux and mac.

```
```

4. Train a Resnet18 model, either pretrained on Imagenet, or preferably the pretrained Skraak Kiwi model, which is currently trained on 7_700_000 images.

Skraak trains on 5 second clips, converted to 224x224 pixel RGB spectrogram images.

```
using Skraak, Glob
# train takes a vector of png file paths, glob is the easy way to build one.
images_1 = glob("Clips*/[D,F,M,N]/*.png") #for example. Note: requires png's as input.
images_2 = glob("Dataset*/[K, N]/*.png")
# Train a model named Test1 for 2 epochs on the png files in images_1,
# start with a pretrained model.
train("Test1", 2, images_1, true)
# Train a model named Test2 for 2 epochs on the png files in images_2,
# train using model found at "path/to/model.jld2"
train("Test2", 2, images_2, "path/to/model.jld2")
# Note: Your unique text labels are sorted alphabetically, and converted to
# integers, [1,2,3...] 
to be consumed by the flux model
# A text file will be saved beside the model.jld2, with the label to
# integer mapping.
```

5. Run inference on raw data using a trained model

Skraak will try to find png images first, in the folders covered by the glob pattern. If there are no png's found it will predict on wav or flac files, using 5 second audio clips, converted to 224x224 pixel RGB spectrogram images, with a 2.5 second hop.

> You are responsible for providing an appropriate model.

> I use a binary Kiwi/Not model for finding calls in audio data, and a Duet/Female/Male/Not model on png clips made from calls detected by the binary model.

> Find some models to start with in the Models folder

```
using Skraak
glob_pattern = "*/*/" #Note: requires folders as input. Folders contain flac, wav or png files.
# Predict label classes of png, wav or flac files found in folders specified by
# glob_pattern using model.jld2. A preds-YYYY-MM-DD.csv file (dated with the day
# of the run) is saved in each folder.
predict(glob_pattern, "path/to/model.jld2")
```

6. Generate audio clips and spectrogram images of all calls found.

```
# Make clips from a preds.csv file of the form:
# file(String),start_time,end_time,label(Int)
# 1 is the label, it can be any int present in the label field of preds.csv
# It saves clips in a folder 'Clips_2023-11-09'
make_clips("preds.csv", 1)
```

7. Sort calls into subclasses (say: Duet, Female, Male, Nothing) manually, or using a model combined with human supervision. TODO

8. Store data from calls and file metadata in a DuckDB database for statistical analysis using SQL, DataFrames, Plots.

```
I will not document this until the DuckDB storage api has stabilised.
For now always store a csv backup using "EXPORT DATABASE 'Backup_2023-10-10';" in the duckdb cli.
I highly recommend storing data in a duckdb database.
Querying a duckdb database with SQL is faster than even julia DataFrames, both leave Pandas in the dust.
```

9. Repeat, iterating on your models as you accumulate more data. 
It's hard until it gets easy.

Managing datasets is like gardening, it takes some weeding and a _lot_ of compost (aka data) to get a good model growing.

Julia is great for machine learning because it is relatively simple to get a GPU working. It does have disadvantages at GPT-4 scale, but for this kind of work it is excellent. Julia shines with any scientific computing task.
name = "SkraakML"uuid = "960381bc-3737-4297-a0a0-71f7f33f3c12"authors = ["David Cary <cdecary@gmail.com>"]version = "0.1.0"[deps]CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"FLAC = "abae9e3b-a9a0-4778-b5c6-ca109b507d99"Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"ImageTransformations = "02fcd773-0e25-5acc-982a-7f6622650795"JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"MLBase = "f0e99cf1-93fa-52ec-9ecc-5026115318e0"Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"Noise = "81d43f40-5267-43b7-ae1c-8b967f377efa"PerceptualColourMaps = "54e51dfa-9dd7-5231-aa84-a4037b83483a"Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d"WAV = "8149f6b0-98f6-5db9-b78f-408fbbb8ef88"cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
MIT LicenseCopyright (c) 2023 David Cary <cdecary@gmail.com> and contributorsPermission is hereby granted, free of charge, to any person obtaining a copyof this software and associated documentation files (the "Software"), to dealin the Software without restriction, including without limitation the rightsto use, copy, modify, merge, publish, distribute, sublicense, and/or sellcopies of the Software, and to permit persons to whom the Software isfurnished to do so, subject to the following conditions:The above copyright notice and this permission notice shall be included in allcopies or substantial portions of the Software.THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ORIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THEAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHERLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THESOFTWARE.