UYYSWKUVHPSAX4KIQZSKDW7HB7EHWEJ2Q4EKE4MCYL5REWXVX42QC
/// Iterator over n-grams analogous to those used by scikit-learn's HashingVectorizer
impl TfidfTransformer {
    /// Constructs a new, unfitted `TfidfTransformer`.
    ///
    /// # Arguments
    ///
    /// * `smooth_idf` - Adds 1 to the numerator and denominator of the IDF term, as if one
    ///   extra document containing every term had been seen. Prevents zero divisions.
    pub fn new(smooth_idf: bool) -> Self {
        Self {
            smooth_idf,
            idf: None,
        }
    }

    /// Fits the transformer to a term-frequency matrix.
    ///
    /// Rows are treated as documents and columns as terms. For every column the document
    /// frequency `df` (number of rows with a non-zero entry) is counted and the weight
    /// `ln(n / df) + 1` is stored, where `n` is the number of rows. With `smooth_idf`
    /// enabled both `n` and `df` are incremented by one first.
    ///
    /// Without smoothing, a column that is zero in every row has `df == 0` and therefore an
    /// infinite weight.
    pub fn fit(&mut self, matrix: &Array<f64, Ix2>) {
        // We need the number of documents to calculate the idf
        let mut n_samples = matrix.shape()[0];
        // Iterate over every column (Axis 1) of the matrix and count the occurrences of each
        // term, meaning +1 for every document where the value is != 0
        let df_iter = matrix
            .axis_iter(Axis(1))
            .map(|column| column.iter().filter(|&&value| value != 0.0).count() as f64);
        let mut idf_vector: Array<f64, _> = ArrayBase::from_iter(df_iter);
        // Apply smoothing if desired. Acts as if there is one document containing every term
        if self.smooth_idf {
            idf_vector += 1.;
            n_samples += 1;
        }
        // Calculate idf from df. The natural logarithm is used (not log10) so the weights
        // agree with the scikit-learn reference values this crate is tested against.
        let n_samples = n_samples as f64;
        idf_vector.mapv_inplace(|df| (n_samples / df).ln() + 1.0);
        self.idf = Some(idf_vector);
    }

    /// Transforms a term-frequency matrix into its TF-IDF representation.
    ///
    /// Every entry is multiplied by the fitted IDF weight of its column, then every row is
    /// divided by its norm. Rows whose norm is zero are returned unchanged instead of being
    /// turned into NaN.
    ///
    /// # Panics
    ///
    /// Panics if `fit` has not been called before. `azip!` additionally requires that the
    /// number of columns matches the length of the fitted IDF vector.
    pub fn transform(&self, mut matrix: Array2<f64>) -> Array2<f64> {
        let weights = self
            .idf
            .as_ref()
            .expect("TfidfTransformer::transform called before fit");
        for mut row in matrix.axis_iter_mut(Axis(0)) {
            // Zero frequencies stay zero: multiplying them by an infinite weight (possible
            // without smoothing) would yield NaN and poison the whole row.
            azip!((tf in &mut row, &weight in weights) if *tf != 0.0 {
                *tf *= weight
            });
            let norm = row.norm();
            // An all-zero row has norm 0; dividing would produce NaN.
            if norm != 0.0 {
                row /= norm;
            }
        }
        matrix
    }

    /// Returns the fitted IDF weights, or `None` if `fit` has not been called yet.
    pub fn idf(&self) -> Option<&Array1<f64>> {
        self.idf.as_ref()
    }
}
/// Iterator over n-grams analogous to those used by scikit-learn's HashingVectorizer
assert_eq!(t, a);
assert_eq!(t, r);
}
#[test]
fn tfidf_transformation_test() {
let v = HashingVectorizer::new(16, 5, 5, false);
let mut tf = TfidfTransformer::new(true);
let input = [
"This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?",
];
let t = array![
[
0.,
0.,
-0.3726943,
-0.3726943,
0.30470201,
0.,
0.,
-0.30470201,
-0.30470201,
0.,
0.,
-0.30470201,
-0.46035161,
0.,
-0.3726943,
0.
],
[
0.,
0.3506238,
-0.44759726,
-0.44759726,
0.18297004,
0.27643583,
0.,
0.18297004,
0.18297004,
0.,
-0.22379863,
0.,
-0.27643583,
0.,
-0.22379863,
-0.3506238
],
[
-0.76438624,
0.,
0.,
0.,
0.19944423,
0.30132545,
0.,
-0.19944423,
0.,
0.,
0.,
-0.19944423,
0.,
0.38219312,
-0.24394892,
0.
],
[
0.,
0.,
-0.75564616,
-0.37782308,
0.30889513,
0.,
0.,
-0.30889513,
-0.30889513,
0.,
0.,
0.,
0.,
0.,
0.,
0.
]
];
/*let t = array![[
0.,
0.,
-0.35355339,
-0.35355339,
0.35355339,
0.,
0.,
-0.35355339,
-0.35355339,
0.,
0.,
-0.35355339,
-0.35355339,
0.,
-0.35355339,
0.
]];*/
let r = v.fit_transform(&input);
println!("{:?}", r);
tf.fit(&r);
println!("idf: {:?}", tf.idf());
let r = tf.transform(r);
assert_eq!(t, r);