dji_droneid/gnuradio/gr-droneid/lib/normalized_xcorr_estimate_i...

/* -*- c++ -*- */
/*
 * Copyright 2022 gr-droneid author.
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

#include "normalized_xcorr_estimate_impl.h"
#include <gnuradio/io_signature.h>

#include <volk/volk.h>
#include <numeric>
#include <gnuradio/droneid/misc_utils.h>

namespace gr {
namespace droneid {

normalized_xcorr_estimate::sptr
normalized_xcorr_estimate::make(const std::vector<gr_complex> &taps) {
    return gnuradio::get_initial_sptr
        (new normalized_xcorr_estimate_impl(taps));
}


/*
         * The private constructor
 */
normalized_xcorr_estimate_impl::normalized_xcorr_estimate_impl(const std::vector<gr_complex> &taps)
    : gr::block("dot_prod",
                gr::io_signature::make(1, 1, sizeof(gr_complex)),
                gr::io_signature::make(1, 1, sizeof(gr_complex))),
      taps_(taps), window_size_(taps.size()) {

    // Remove the mean from the taps, conjugate the taps, and calculate the variance ahead of time
    const auto mean =
        std::accumulate(taps_.begin(), taps_.end(), gr_complex{0, 0}) / static_cast<float>(taps_.size());

    for (auto & tap : taps_) {
        tap = std::conj(tap) - mean;
    }

    taps_var_ = misc_utils::var_no_mean(&taps_[0], taps_.size());

    // Create some constants to enable the use of multiplies instead of divides later
    window_size_recip_ = 1.0f / static_cast<float>(window_size_);
    window_size_recip_complex_ = gr_complex{window_size_recip_, 0};
}

/*
         * Our virtual destructor.
 */
normalized_xcorr_estimate_impl::~normalized_xcorr_estimate_impl() {
}

int
normalized_xcorr_estimate_impl::general_work(int noutput_items,
                                             gr_vector_int &ninput_items,
                                             gr_vector_const_void_star &input_items,
                                             gr_vector_void_star &output_items) {
    // Get handles to the input and output arrays
    const auto *in = (const gr_complex *) input_items[0];
    auto *out = (gr_complex *) output_items[0];

    // Always tell GNU Radio that all samples were accepted even if not this many samples were written out
    consume_each(noutput_items);

    // This is how the remaining samples are buffered between calls.  It's important to realize that this algo
    // needs <window_size> samples to be able to produce one output value.  This means that there will always
    // be unused samples at the end of each function call that need to be held onto until the next call.  The
    // hope was that set_history() took care of this, but it does not.  So, the remaining samples from the last
    // call are stored in <buffer_>.  The <in> buffer can't hold more samples (it's not known how many samples
    // wide the buffer is) so in order to use the old samples without jumping through very slow hoops, the new
    // samples are appended to the old samples.
    buffer_.insert(buffer_.end(), in, in + noutput_items);

    // Exit early if there aren't enough samples to process.
    if (buffer_.size() < window_size_) {
        return 0;
    }

    // Figure out how many windows worth of data can be processed.  It's possible that this specific call
    // doesn't have enough storage in its output buffer to hold all the samples that could be processed.  For
    // this reason the min of the available output buffer space and number of windows that could be processed
    // must be used.
    const auto num_steps = std::min(static_cast<uint64_t>(noutput_items), buffer_.size() - window_size_);

    // Resize the buffers as needed
    if (sums_.size() < num_steps) {
        sums_.resize(num_steps);
        abs_squared_.resize(num_steps + window_size_);
        vars_.resize(num_steps);
    }

    // TODO(24June2022): There are <window_size-1> extra operations happening on each call.  This comes from the
    //                   fact that some of these computations are being done on samples that are going to be
    //                   used again on the next function call.  Would be a good idea to buffer the abs squared
    //                   and maybe the running variance average.

    // What is happening below is roughly the following:
    //
    // for idx = 1:length(buffer_) - window_size_
    //   window = buffer_(idx:idx + window_size_ - 1);
    //   variance = sum(abs(window).^2) / window_size_;
    //   dot_prod = sum(window .* taps_) / window_size_;
    //   out(idx) = dot_prod / sqrt(variance * taps_var_);
    // end
    //
    // But the variance is calculated as a running sum.  The first variance has to be calculated the hard way,
    // and then every iteration of the loop will subtract off the left-most element of the window that just
    // dropped off, and adds on the new right-most element in the window.
    //
    // Doing this calculation of the first element outside the loop prevents needing a conditional in the
    // critical section

    // Calculate the first variance the hard way
    volk_32fc_magnitude_squared_32f(&abs_squared_[0], &buffer_[0], num_steps + window_size_);
    auto running_var = std::accumulate(abs_squared_.begin(), abs_squared_.begin() + window_size_, 0.f);
    vars_[0] = running_var;

    // Calculate the first dot product
    volk_32fc_x2_dot_prod_32fc(&out[0], &buffer_[0], &taps_[0], window_size_);

    // Calculate the running abs value sum and dot product for the remaining samples
    for (uint32_t idx = 1; idx < num_steps; idx++) {
        // sum(abs(window).^2)
        running_var = running_var - abs_squared_[idx - 1] + abs_squared_[idx + window_size_];
        vars_[idx] = running_var;

        // Compute tue dot product of the current window and the filter taps
        // sum(window .* taps_)
        volk_32fc_x2_dot_prod_32fc(&out[idx], &buffer_[idx], &taps_[0], window_size_);
    }

    // Scale the dot product down
    volk_32fc_s32fc_multiply_32fc(&out[0], &out[0], window_size_recip_complex_, num_steps);

    // Scale the variance sums down
    volk_32f_s32f_multiply_32f(&vars_[0], &vars_[0], window_size_recip_, num_steps);

    // Multiply each variance by the tap variances then take the reciprocal
    volk_32f_s32f_multiply_32f(&vars_[0], &vars_[0], taps_var_, num_steps);

    // Calculate the inverse square root (1/sqrt(vars_[x]))
    volk_32f_invsqrt_32f(&vars_[0], &vars_[0], num_steps);

    // Divide by the square root above
    volk_32fc_32f_multiply_32fc(&out[0], &out[0], &vars_[0], num_steps);

    // Go through all outputs and replace NaN's with zeros.  This isn't strictly required, but nice to have
    for (uint32_t idx = 0; idx < num_steps; idx++) {
        if (out[idx].real() == FP_NAN || out[idx].imag() == FP_NAN) {
            out[idx] = zero_complex_;
        }
    }

    // Remove all the samples that have been processed from the buffer.  Leaving just the last <window_size_-1>
    // samples for the next call
    buffer_.erase(buffer_.begin(), buffer_.begin() + num_steps);

    // Tell runtime system how many output items we produced.
    return num_steps;
}

} /* namespace droneid */
} /* namespace gr */