多线程基准测试问题

2023-12-12

我编写了一个代码，可以随机生成两个尺寸从 2x2 到 50x50 的矩阵。然后，我记录从维度 2 到维度 50 的每个矩阵乘法所需的时间。我将这个时间记录 100 次，以获得每种情况 2 -50 的良好平均值。该程序首先按顺序将矩阵相乘，并将平均执行时间记录在 csv 文件中。然后，它继续使用 pthread 进行并行矩阵乘法，并将平均执行时间记录在单独的 csv 文件中。我的问题是顺序乘法的平均执行时间比并行执行短很多。对于大小为 50 的矩阵，顺序乘法需要 500 微秒，并行乘法需要 2500 微秒。这是由于我如何计时代码而导致的问题吗？或者我的线程实现效果不是很好并且实际上导致代码需要更长的时间来执行？我在生成矩阵后启动计时器，并在所有线程连接在一起后停止计时器。线程代码最初是为两个大小不均匀的矩阵编写的，因此它实现了负载平衡算法。

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <algorithm>
#include  <vector>
#include <stdlib.h>
#include <pthread.h>
#include <cstdlib>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <unistd.h>


using namespace std;
int n,i,j,t,k,l,MAX;
float randomnum,sum1, avg;
float matA[100][100];
float matB[100][100];
float matC[100][100];
struct Loading
{
    int r;
    int c;
    int n;
    int m;
};

// threads
pthread_t threads[100] = { 0 };

// indexes
int indexes[100] = {0};

// load balancing
Loading loads[100] = { 0 };

// for printing in thread
pthread_mutex_t M;

// run thread
void* multi(void* arg)
{
    int index = *((int*)(arg));
    Loading load = loads[index];
    int i = 0;
    int j = 0;
    int k = 0;
    int istart = load.r;
    int jstart = load.c;

    pthread_mutex_lock(&M);
     // cout << "thread #" << index << " pid: " << getpid() << " starting " << " row " << istart << " col "  << jstart << endl;
    pthread_mutex_unlock(&M);

    // logic to balance loads amongst threads using for loop
    int n = load.n;
    for (i = istart; i < MAX; i++)
    {

        for (j =jstart;n > 0 && j < MAX; j++,n--)
        {
            for (k = 0; k < MAX; k++)
            {
                matC[i][j] += matA[i][k] * matB[k][j];
            }

            pthread_mutex_lock(&M);
            //cout << "row " << i << " col "<< j << " value " << matC[i][j] << endl;
            pthread_mutex_unlock(&M);
        }

        jstart = 0;

        if (n == 0)
        {
           pthread_mutex_lock(&M);
          // cout << "thread #" << index << " pid: " << getpid() << " has completed " << endl;
           pthread_mutex_unlock(&M);
           return 0;

        }
    }


    return 0;
}

int num_threads = 0;
int MAX_THREADS= 0;




int main()
{

pthread_mutex_init(&M, NULL);


srand ( time(NULL) );

//for (n=2; n<4; n++) {
ofstream myfile;
    //  myfile.open ("/home/gage/Desktop/timing/seqrecord.csv");
myfile.open ("seqrecord.csv");
      myfile << "testtowork\n";

for (n=2; n<50; n++){
    MAX =n;
    myfile << n <<","; 
  for (int i = 0; i < MAX; i++) {
        for (int j = 0; j < MAX; j++) {
            matA[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
            matB[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
        }
  }
for(t=0; t<101; t++){
 //clock_t startTime = clock();
auto start = chrono::steady_clock::now();
  for (i = 0; i < MAX; ++i)
for (j = 0; j < MAX; ++j)
for (k = 0; k < MAX; ++k)
{
matC[i][j] += matA[i][k] * matB[k][j];
}


//int stop_s=clock();
auto end = chrono::steady_clock::now();
//cout << double( clock() - startTime ) / (double)CLOCKS_PER_SEC/1000000000<< " milli-seconds." << endl;
//cout << chrono::duration_cast<chrono::microseconds>(end - start).count() <<endl;
myfile << chrono::duration_cast<chrono::microseconds>(end - start).count() <<",";
sum1 = sum1+chrono::duration_cast<chrono::microseconds>(end - start).count();

}
avg = sum1 / 100;
myfile << "Average execution" << "," << avg << "\n";
sum1 =0;
avg = 0;


// }
}

myfile.close();
ofstream myfile1;

myfile1.open ("parallel.csv");
      myfile1 << "testtowork\n";





for (n=2; n<51; n++)
{
MAX = n;
MAX_THREADS = n*n;
num_threads =n;
 myfile1 << n <<","; 
  for (int i = 0; i < MAX; i++) {
        for (int j = 0; j < MAX; j++) {
            matA[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
            matB[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
        }
  }
           for(t=0; t<101; t++){
         //clock_t startTime = clock();
            auto start = chrono::steady_clock::now();
         // calculade load balancing

        // cout << "calculation load balancing" << endl;

            double nwhole = (double)MAX_THREADS / num_threads;
            double last = 0;
            double sum = 0;
            int k = 0;

            loads[k].r = 0;
            loads[k].c = 0;
            loads[k].n = 0;


            while (k < num_threads)
            {

                sum = sum + nwhole;

                loads[k].n = (int)sum - (int)last;

                // check last length
                if(k == num_threads-1 && sum != MAX_THREADS)
                {
                    sum=MAX_THREADS;
                    loads[k].n=(int)sum - (int)last;
                }

                // display result
              //  cout << (int)last << " to " << (int)sum << " length: " << (int)sum - int(last) << endl;


                k++;

                if(k < num_threads)
                {
                loads[k].r = ((int)sum) / MAX;
                loads[k].c = ((int)sum) % MAX;
                }


                last = sum;


            }




        //cout << "making threads" << endl;

        void* exit_status;

        int rc;

        for( i = 0; i < num_threads ; i++ ) {
         //     cout << "main() : creating thread, " << i << endl;
              indexes[i] = i;
              rc = pthread_create(&threads[i], NULL, multi, (void *)&indexes[i]);



              if (rc) {
           //      cout << "Error:unable to create thread," << rc << endl;
                 exit(-1);
              }
           }

        // wait for threads to end
        for (j = 0; j < num_threads; j++)
        {

            pthread_join(threads[j], &exit_status);
        }

auto end = chrono::steady_clock::now();
//cout << double( clock() - startTime ) / (double)CLOCKS_PER_SEC/1000000000<< " milli-seconds." << endl;
//cout << chrono::duration_cast<chrono::microseconds>(end - start).count() <<endl;
myfile1 << chrono::duration_cast<chrono::microseconds>(end - start).count() <<",";
sum1 = sum1+chrono::duration_cast<chrono::microseconds>(end - start).count();
 }
 avg = sum1 / 100;
myfile1 << "Average" << "," << avg << "\n";
sum1 =0;
avg = 0;



       }


return 0;
}

我最近刚刚写了一个类似问题的答案SO：具有 C++11 多线程的 Eigen 库.

由于我也对这个主题感兴趣并且手头已经有了工作代码，因此我将该示例改编为 OP 的矩阵乘法任务：

test-multi-threading-matrix.cc:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;
  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;
  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }
    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};

template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}

typedef MatrixT<double> Matrix;

typedef std::uint16_t Value;
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;

Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}

Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}

std::vector<Time> makeTest(size_t dim)
{
  const size_t NThreads = std::thread::hardware_concurrency();
  const size_t nThreads = std::min(dim * dim, NThreads);
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results4[4] = {
    Matrix(dim, dim),
    Matrix(dim, dim),
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - stupid aproach
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t k = 0, n = dim * dim; k < n;) {
        size_t nT = 0;
        for (; nT < nThreads && k < n; ++nT, ++k) {
          const size_t i = k / dim, j = k % dim;
          threads[nT] = std::move(std::thread(
            [i, j, &results, &a, &b]() {
              results[i][j] = dot(a, i, b, j);
            }));
        }
        for (size_t iT = 0; iT < nT; ++iT) threads[iT].join();
      }
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - interleaved
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[2];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (Value iT = 0; iT < nThreads; ++iT) {
        threads[iT] = std::move(std::thread(
          [iT, dim, &results, &a, &b, nThreads]() {
            for (size_t k = iT, n = dim * dim; k < n; k += nThreads) {
              const size_t i = k / dim, j = k % dim;
              results[i][j] = dot(a, i, b, j);
            }
          }));
      }
      for (std::thread &threadI : threads) threadI.join();
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - grouped
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[3];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t iT = 0; iT < nThreads; ++iT) {
        threads[iT] = std::move(std::thread(
          [iT, dim, &results, &a, &b, nThreads]() {
            const size_t n = dim * dim;
            for (size_t k = iT * n / nThreads, kN = (iT + 1) * n / nThreads;
              k < kN; ++k) {
              const size_t i = k / dim, j = k % dim;
              results[i][j] = dot(a, i, b, j);
            }
          }));
      }
      for (std::thread &threadI : threads) threadI.join();
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results4 / sizeof *results4;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results4[0][i][j] != results4[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results4[0][" << i << "]["  << j << "]: "
            << results4[0][i][j]
            << " != results4[" << iResult << "][" << i << "][" << j << "]: "
            << results4[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results4[" << iResult << "]!\n";
  }
  // done
  return times;
}

int main()
{
  std::cout << "std::thread::hardware_concurrency(): "
    << std::thread::hardware_concurrency() << '\n';
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "Single"
      << " | " << std::setw(10) << "Multi 1"
      << " | " << std::setw(10) << "Multi 2"
      << " | " << std::setw(10) << "Multi 3"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}

在我的第一次测试中，我从 2×2 矩阵开始，并将以 64×64 矩阵结尾的每个测试系列的行数和列数加倍。

我很快得出了同样的结论Mike：这些矩阵太小了。设置和加入线程的开销会消耗并发性可能获得的任何加速。因此，我修改了从 64×64 矩阵开始到 512×512 结束的测试系列。

我编译并运行cygwin64（在 Windows 10 上）：

$ g++ --version
g++ (GCC) 7.3.0

$ g++ -std=c++17 -O2 test-multi-threading-matrix.cc -o test-multi-threading-matrix

$ ./test-multi-threading-matrix
std::thread::hardware_concurrency(): 8
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |        417 |     482837 |       1068 |       1080
 2. |        403 |     486775 |       1034 |       1225
 3. |        289 |     482578 |       1478 |       1151
 4. |        282 |     502703 |       1103 |       1081
 5. |        398 |     495351 |       1287 |       1124
 6. |        404 |     501426 |       1050 |       1017
 7. |        402 |     483517 |       1000 |        980
 8. |        271 |     498591 |       1092 |       1047
 9. |        284 |     494732 |        984 |       1057
10. |        288 |     494738 |       1050 |       1116
Average Values:
    |      343.8 |   492324.8 |     1114.6 |     1087.8
Ratio:
    |      1.000 |   1432.009 |      3.242 |      3.164

Test for A[128][128] * B[128][128]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |       2282 |    1995527 |       2215 |       1574
 2. |       3076 |    1954316 |       1644 |       1679
 3. |       2952 |    1981908 |       2572 |       2250
 4. |       2119 |    1986365 |       1568 |       1462
 5. |       2676 |    2212344 |       1615 |       1657
 6. |       2396 |    1981545 |       1776 |       1593
 7. |       2513 |    1983718 |       1950 |       1580
 8. |       2614 |    1852414 |       1737 |       1670
 9. |       2148 |    1955587 |       1805 |       1609
10. |       2161 |    1980772 |       1794 |       1826
Average Values:
    |     2493.7 |  1988449.6 |     1867.6 |     1690.0
Ratio:
    |      1.000 |    797.389 |      0.749 |      0.678

Test for A[256][256] * B[256][256]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |      32418 |    7992363 |      11753 |      11712
 2. |      32723 |    7961725 |      12342 |      12490
 3. |      32150 |    8041516 |      14646 |      12304
 4. |      30207 |    7810907 |      11512 |      11992
 5. |      30108 |    8005317 |      12853 |      12850
 6. |      32665 |    8064963 |      13197 |      13386
 7. |      36286 |    8825215 |      14381 |      15636
 8. |      35068 |    8015930 |      16954 |      12287
 9. |      30673 |    7973273 |      12061 |      13677
10. |      36323 |    7861856 |      14223 |      13510
Average Values:
    |    32862.1 |  8055306.5 |    13392.2 |    12984.4
Ratio:
    |      1.000 |    245.125 |      0.408 |      0.395

Test for A[512][512] * B[512][512]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |     404459 |   32803878 |     107078 |     103493
 2. |     289870 |   32482887 |      98244 |     103338
 3. |     333695 |   29398109 |      87735 |      77531
 4. |     236028 |   27286537 |      81620 |      76085
 5. |     254294 |   27418963 |      89191 |      76760
 6. |     230662 |   27278077 |      78454 |      84063
 7. |     274278 |   27180899 |      74828 |      83829
 8. |     292294 |   29942221 |     106133 |     103450
 9. |     292091 |   33011277 |     100545 |      96935
10. |     401007 |   33502134 |      98230 |      95592
Average Values:
    |   300867.8 | 30030498.2 |    92205.8 |    90107.6
Ratio:
    |      1.000 |     99.813 |      0.306 |      0.299

我对 VS2013（发布模式）做了同样的事情并得到了类似的结果。

3 的加速听起来并没有那么糟糕（忽略它与 8 相距甚远的事实，您可能认为 8 是 H/W 并发性 8 的理想选择）。

在摆弄矩阵乘法时，我得到了一个优化的想法，我也想检查它——甚至超越多线程。这是改进缓存局部性的尝试。

For this, I transpose the 2^nd matrix before multiplication. For the multiplication, a modified version of dot() (dotT()) is used which considers the transposition of 2^nd matrix respectively.

我分别修改了上面的示例代码，得到了test-single-threading-matrix-transpose.cc:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <vector>

template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;
  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;
  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }
    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};

template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}

template <typename VALUE>
MatrixT<VALUE> transpose(const MatrixT<VALUE> mat)
{
  MatrixT<VALUE> matT(mat.getNumCols(), mat.getNumRows());
  for (size_t i = 0; i < mat.getNumRows(); ++i) {
    for (size_t j = 0; j < mat.getNumCols(); ++j) {
      matT[j][i] = mat[i][j];
    }
  }
  return matT;
}

template <typename VALUE>
VALUE dotT(const MatrixT<VALUE> &mat1, size_t iRow1, const MatrixT<VALUE> &matT2, size_t iRow2)
{
  const size_t n = mat1.getNumCols();
  assert(n == matT2.getNumCols());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow1][i] * matT2[iRow2][i];
  return sum;
}

typedef MatrixT<double> Matrix;

typedef std::uint16_t Value;
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;

Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}

Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}

std::vector<Time> makeTest(size_t dim)
{
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results2[2] = {
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // single threading - with transposed matrix
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      const Matrix bT = transpose(b);
      // do experiment single-threaded with transposed B
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dotT(a, i, bT, j);
      }
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results2 / sizeof *results2;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results2[0][i][j] != results2[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results2[0][" << i << "]["  << j << "]: "
            << results2[0][i][j]
            << " != results2[" << iResult << "][" << i << "][" << j << "]: "
            << results2[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results2[" << iResult << "]!\n";
  }
  // done
  return times;
}

int main()
{
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "A * B"
      << " | " << std::setw(10) << "A *T B^T"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}

我编译并再次运行cygwin64（在 Windows 10 上）：

$ g++ -std=c++17 -O2 test-single-threading-matrix-transpose.cc -o test-single-threading-matrix-transpose && ./test-single-threading-matrix-transpose
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |        394 |        366
 2. |        394 |        368
 3. |        396 |        367
 4. |        382 |        368
 5. |        392 |        289
 6. |        297 |        343
 7. |        360 |        341
 8. |        399 |        358
 9. |        385 |        354
10. |        406 |        374
Average Values:
    |      380.5 |      352.8
Ratio:
    |      1.000 |      0.927

Test for A[128][128] * B[128][128]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |       2972 |       2558
 2. |       3317 |       2556
 3. |       3279 |       2689
 4. |       2952 |       2213
 5. |       2745 |       2668
 6. |       2981 |       2457
 7. |       2164 |       2274
 8. |       2634 |       2106
 9. |       2126 |       2389
10. |       3015 |       2477
Average Values:
    |     2818.5 |     2438.7
Ratio:
    |      1.000 |      0.865

Test for A[256][256] * B[256][256]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |      31312 |      17656
 2. |      29249 |      17127
 3. |      32127 |      16865
 4. |      29655 |      17287
 5. |      32137 |      17687
 6. |      29788 |      16732
 7. |      32251 |      16549
 8. |      32272 |      16257
 9. |      28019 |      18042
10. |      30334 |      17936
Average Values:
    |    30714.4 |    17213.8
Ratio:
    |      1.000 |      0.560

Test for A[512][512] * B[512][512]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |     322005 |     135102
 2. |     310180 |     134897
 3. |     329994 |     134304
 4. |     335375 |     137701
 5. |     330754 |     134252
 6. |     353761 |     136732
 7. |     359234 |     135632
 8. |     351498 |     134389
 9. |     360754 |     135751
10. |     368602 |     137139
Average Values:
    |   342215.7 |   135589.9
Ratio:
    |      1.000 |      0.396

令人印象深刻，不是吗？

它实现了与上述多线程尝试类似的加速（我的意思是更好的），但使用单核。

The additional effort for transposing the 2^nd matrix (which is considered in measurement) does more than amortize. This isn't that surprising because there a so many more read-accesses in multiplication (which now access consecutive bytes) compared to the additional effort to construct/write the transposed matrix once.

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

多线程基准测试问题的相关文章

未提供参数时如何指定 C# System.Commandline 行为？

在我的控制台应用程序中当未提供控制台参数时将执行我指定列表在本例中为参数 3 的任何处理程序调用该处理程序时布尔参数设置为 false 但对我来说根本不调用它更有意义如何防止这种情况发生并显示帮助文本 using System
如何将 protobuf-net 与不可变值类型一起使用？

假设我有一个像这样的不可变值类型 Serializable DataContract public struct MyValueType ISerializable private readonly int x private readon
计算 Richtextbox 中所有单词的最有效方法是什么？

我正在编写一个文本编辑器需要提供实时字数统计现在我正在使用这个扩展方法 public static int WordCount this string s s s TrimEnd if String IsNullOrEmpty s re
MVC 在布局代码之前执行视图代码并破坏我的脚本顺序

我正在尝试将所有 javascript 包含内容移至页面底部我正在将 MVC 与 Razor 一起使用我编写了一个辅助方法来注册脚本它按注册顺序保留脚本并排除重复的内容 Html RegisterScript scripts som
在 C 中匹配二进制模式

我目前正在开发一个 C 程序需要解析一些定制的数据结构幸运的是我知道它们是如何构造的但是我不确定如何在 C 中实现我的解析器每个结构的长度都是 32 位并且每个结构都可以通过其二进制签名来识别举个例子有两个我感兴趣的特定结构
将 Word 文档另存为图像

我正在使用下面的代码将 Word 文档转换为图像文件但是图片显得太大内容不适合有没有办法渲染图片或将图片保存到合适的尺寸 private void btnConvert Click object sender EventArgs e
qdbusxml2cpp 未知类型

在使用 qdbusxml2cpp 程序将以下 xml 转换为 Qt 类时我收到此错误 qdbusxml2cpp c ObjectManager a ObjectManager ObjectManager cpp xml object ma
从 Linux 内核模块中调用用户空间函数

我正在编写一个简单的 Linux 字符设备驱动程序以通过 I O 端口将数据输出到硬件我有一个执行浮点运算的函数来计算硬件的正确输出不幸的是这意味着我需要将此函数保留在用户空间中因为 Linux 内核不能很好地处理浮点运算这是设
在一个平台上，对于所有数据类型，所有数据指针的大小是否相同？ [复制]

这个问题在这里已经有答案了 Are char int long 甚至long long 大小相同在给定平台上不能保证它们的大小相同尽管在我有使用经验的平台上它们通常是相同的 C 2011 在线草稿 http www open std
我可以使用 moq Mock 来模拟类而不是接口吗？

正在经历https github com Moq moq4 wiki Quickstart https github com Moq moq4 wiki Quickstart 我看到它 Mock 一个接口我的遗留代码中有一个没有接口的类
使用自定义堆的类似 malloc 的函数

如果我希望使用自定义预分配堆构造类似 malloc 的功能那么 C 中最好的方法是什么我的具体问题是我有一个可映射类似内存的设备已将其放入我的地址空间中但我需要获得一种更灵活的方式来使用该内存来存储将随着时间的推移分配和释放的
使用

因此我决定开始使用 C 进行编程我所做的一件事就是创建一个 pausec exe pause exe 克隆它有效但是当像这样调用它时 lt nul pausec 它崩溃了据我所知我得到的错误是这样的未处理的异常 System

等待进程释放文件

我如何等待文件空闲以便ss Save 可以用新的覆盖它吗如果我紧密地运行两次左右我会得到一个generic GDI error
“接口”类似于 boost::bind 的语义

我希望能够将 Java 的接口语义与 C 结合起来起初我用过boost signal为给定事件回调显式注册的成员函数这非常有效但后来我发现一些函数回调池是相关的因此将它们抽象出来并立即注册所有实例的相关回调是有意义的但我了解到的
使用管道时，如果子进程数量大于处理器数量，进程是否会被阻塞？

当子进程数量很大时我的程序停止运行我不知道问题是什么但我猜子进程在运行时以某种方式被阻止下面是该程序的主要工作流程 void function int process num int i initial variables for
动态添加 ASP.Net 控件

我有一个存储过程它根据数据库中存储的记录数返回多行现在我想有一种方法来创建 div 带有包含该行值的控件的标记如果从数据库返回 10 行则 10 div 必须创建标签我有下面的代码来从数据库中获取结果但我不知道如何从这里继续 S
方法优化 - C#

我开发了一种方法允许我通过参数传入表字符串列数组字符串和值数组对象然后使用这些参数创建参数化查询虽然它工作得很好但代码的长度以及多个 for 循环散发出一种代码味道特别是我觉得我用来在列和值之间插入逗号的方法可以用不同的
System.IO.FileNotFoundException：找不到网络路径。在 Windows 7 上使用 DirectoryEntry 对象时出现异常

我正在尝试使用 DirectoryEntry 对象连接到远程 Windows 7 计算机这是我的代码 DirectoryEntry obDirEntry new DirectoryEntry WinNT hostName hostName
WebSocket安全连接自签名证书

目标是一个与用户电脑上安装的 C 应用程序交换信息的 Web 应用程序客户端应用程序是 websocket 服务器浏览器是 websocket 客户端最后用户浏览器中的 websocket 客户端通过 Angular 持久创建并且
使用 .NET Process.Start 运行时挂起进程 - 出了什么问题？

我在 svn exe 周围编写了一个快速而肮脏的包装器来检索一些内容并对其执行某些操作但对于某些输入它偶尔会重复挂起并且无法完成例如一个调用是 svn list svn list http myserver 84 svn Docum

随机推荐

如何将 Android 中的大型 pdf 文件加载到我的 web 视图中

我无法将大型 pdf 文件有 900 页加载到我的在我的 Android 应用程序中的 webview 我尝试了这段代码并且在任何其他 pdf 但是当我尝试打开一个大的 pdf 时它显示无预览可用的 wvReport getS
从单独的脚本更新 WebExtension webRequest.onBeforeRequest 侦听器 URL 设置

我目前正在创建一个 WebExtension 其中我在发出的 Web 请求上注册一个侦听器如下所示 main js chrome webRequest onBeforeRequest addListener main function u
matlab脚本编辑

我想在两种情况下编辑 matlab 脚本 1 在linux终端运行的matlab命令窗口中如何创建和编辑脚本文件使用 edit filename 将调用交互式编辑器这在 Linux 服务器上运行时是不需要的 2 在emacs中有没有
在 JavaScript 中设置剃刀变量

我正在尝试从 JavaScript 函数内将表单上的组合框的内容分配给剃刀变量就像这样
控制window.onbeforeunload事件

出于好奇我可以控制 window onbeforeunload 事件例如检查用户是否决定离开页面或留在其中我可以根据他的决定发出警报或某些功能吗如果是请告诉我我知道 window onbeforeunload 是一个事件向用户
脚本标签 - 异步和延迟

我有几个关于属性的问题async defer为了
使用 Jquery 和 jgestures 在 Cordova 中滑动

当涉及到使用 jQuery jQuery mobile 进行滑动时 Stackoverflow 中有很多主题然而它们似乎都没有按照我想要的方式工作以下是我的phonegap应用程序的索引页面的结构作为this主题推荐我尝试了jge
如何统计每个病人预约了多少名医生？

我需要的结果如下 PatientID Doctors Patient1 3 Patient2 2 Patient3 1 预订的桌子是这样的 GPS Table PatientID DoctorID DATE Patient1 Doctor1
如何从给定数据库中删除所有带有前缀“bkp”的表？

我有一个 SQL Server 2005 在该服务器中我有 3 个数据库 gt a b c 如果我想删除表仅来自数据库 c 的表表的名称应以 bkp 开头表应在一天前创建尝试这个 USE C GO SELECT DROP TABLE
我需要在 mysql 字段中查找并替换 \n

我有这样的数据 1 街 n2 街 nmycity nmytown 我想要做的是将 n 替换为 char 10 因为我需要在 db 字段中进行真正的换行符我有 UPDATE data set value REPLACE value n ch
缓存阻塞实际上如何提高性能？

我正在阅读有关缓存阻塞的内容这个英特尔页面 It says 阻塞是一种众所周知的优化技术可以帮助避免许多应用程序中的内存带宽瓶颈阻塞背后的关键思想是通过确保数据在多次使用时保留在缓存中来利用应用程序中可用的固有数据重用提供一个例子 f
我可以在 Intel Mac 上为 M1 Mac 构建 PyInstaller 二进制文件吗？

I have recently found out that PyInstaller now works on macOS Monterey so I tried to build a binary When testing on my I
RXTX 串行连接 - 阻塞 read() 的问题

我正在尝试使用 RXTX 库来阻止 Windows XP 和 7 上的串行通信我已经测试了两端与超级终端的连接它工作完美我使用以下代码设置了连接为了清楚起见省略了异常处理和防御检查 private InputStream inSt
按因子将比例列添加到数据框

我正在尝试将一列添加到由按因子标准化的值组成的数据框中例如 data frame 261 obs of 3 variables Area Factor w 29 levels Antrim Ards 1 1 1 1 1 1 1 1 1 2
matplotlib：颜色条及其文本标签

我想创建一个colorbar传说heatmap 使得标签位于每个离散颜色的中心从这里借来的例子 import matplotlib pyplot as plt import numpy as np from matplotlib colo
如何将 HTML 文件加载到画布中？

我知道我们可以将图像加载到画布中但我想知道我们是否能够将简单的 HTML 文件加载到画布中如果是怎么办 Thanks Short answer No you cannot 长答案不可靠但是是的你可以通过某些可能是黑客的方式
面向对象的d3

我有想要添加到 SVG 的数据对象考虑以下伪片段 var data counter 0 for var col 1 col lt 5 col for var row 1 row lt 3 row data push id obj coun
为什么 django 的 prefetch_lated() 只能与 all() 一起使用，而不能与 filter() 一起使用？

假设我有这个模型 class PhotoAlbum models Model title models CharField max length 128 author models CharField max length 128 clas
如何将本地 DynamoDB 数据库移动到 AWS 云？

我有一组静态数据想要输入 AWS DynamoDB 我已经下载了本地版本的 DynamoDB 并测试了在其上生成数据的代码现在我在本地拥有包含所有数据的数据库我的问题是有没有一种有效的方法将本地数据库迁移到云端我知道我可以将 CSV
多线程基准测试问题

我编写了一个代码可以随机生成两个尺寸从 2x2 到 50x50 的矩阵然后我记录从维度 2 到维度 50 的每个矩阵乘法所需的时间我将这个时间记录 100 次以获得每种情况 2 50 的良好平均值该程序首先按顺序将矩阵相乘并将

多线程基准测试问题

多线程基准测试问题 的相关文章

随机推荐

热门标签

多线程基准测试问题的相关文章