#include <iostream>
#include <chrono>
#include <algorithm>
#include <cstdio>
#include <string>
// Forward declaration of the matrix routine defined after main().
//   m1, m2 - input matrices as flat arrays; main() passes 4*4-element buffers.
//   size   - main() passes the side length (dim), not the element count.
//   result - output buffer; the definition adds into it with += rather than
//            overwriting it.
// NOTE(review): FT is not defined in this file - presumably supplied by the
// build (e.g. -DFT=float); confirm.
void multiply_tr(FT* m1, FT* m2, size_t size, FT* result);
/// Benchmark driver: times repeated multiply_tr calls on a fixed 4x4 matrix.
///
/// Usage: <program> [times]
///   times - number of multiply_tr calls to time (default 100). It is parsed
///           with std::stoi, which throws if the argument is not a number.
///
/// Prints the elapsed time in nanoseconds, then the contents of the result
/// buffer, one matrix row per output line.
int main(int argc, char** argv)
{
    constexpr size_t dim {4};
    constexpr size_t size {dim*dim};
    const int times = argc > 1 ? std::stoi(argv[1]) : 100;

    FT m1[size] = {1.5, 2.3, 1.1, 4,
                   1.2, 2.2, 3.3, 4.4,
                   3.1, .1, 2.2, .87,
                   5.5, 0.8, 1.7, 2.2};
    FT res[size];
    std::fill(res, res + size, 0);

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments (high_resolution_clock may alias system_clock).
    const auto s = std::chrono::steady_clock::now();
    // res is deliberately not cleared between calls (that would be timed too);
    // multiply_tr adds into it, so the values printed below are the sum over
    // all `times` iterations.
    for (int i {}; i < times; ++i) multiply_tr(m1, m1, dim, res);
    const auto e = std::chrono::steady_clock::now();

    // The tick-count type is only guaranteed to be a signed integer of at
    // least 64 bits; cast to long long so it always matches %lld.
    const auto elapsed_ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(e - s).count();
    std::printf("Time to complete: %lldns.\n",
                static_cast<long long>(elapsed_ns));

    for (size_t i {}; i < dim; ++i)
    {
        for (size_t j {}; j < dim; ++j)
            // %g expects a double; the cast keeps this correct whatever
            // arithmetic type FT is.
            std::printf("%g, ", static_cast<double>(res[i*dim + j]));
        std::puts("");
    }
    return 0;
}
/// Accumulates the product of m1 and the transpose of m2 into result.
///
/// All three buffers are treated as row-major square matrices with `size`
/// rows and columns (size*size elements each). For every (i, j):
///
///     result[i*size + j] += sum over k of m1[i*size + k] * m2[j*size + k]
///
/// i.e. the dot product of row i of m1 with row j of m2.
///
/// @param m1     left operand, size*size elements (only read).
/// @param m2     right operand, size*size elements (only read); its rows are
///               used as the columns of the multiplication.
/// @param size   side length of the matrices, not the element count.
/// @param result output buffer, size*size elements. Values are added to the
///               existing contents, so zero it first for a plain product.
void multiply_tr(FT* m1, FT* m2, size_t size, FT* result)
{
    for (size_t i {}; i < size; ++i)
        for (size_t j {}; j < size; ++j)
        {
            auto pos_row1 = i * size;       // start of row i in m1
            auto pos_row2 = j * size;       // start of row j in m2
            auto pos_result = i * size + j; // the single output cell (i, j)
            // k walks along both rows; the output index stays fixed so no
            // write ever lands outside the size*size result buffer.
            for (size_t k {}; k < size; ++k)
                result[pos_result] += m1[pos_row1 + k] * m2[pos_row2 + k];
        }
}