【c/c++】AVX2を使った行列積を計算するプログラム

講義で作ったOpenMPとAVX2命令を使って4096x4096の行列積を計算するプログラムです。そのまま並列化しろという内容だったので最適化はちゃんとしていませんが、折角作ったので公開しておきます。

ソースコード

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <omp.h>
#include <immintrin.h>

#define SIZE 4096

void MatrixMul_1_ST(double *A, double *B, double *C) { //シングルスレッドSIMD無し
    int i,j,k;
    double sum;
    double tmp;

    for (i=0;i<SIZE;i++){
        for (j=i+1;j<SIZE;j++){
            tmp         = B[j*SIZE+i];
            B[j*SIZE+i] = B[i*SIZE+j];
            B[i*SIZE+j] = tmp;
        }
    }

    for (i=0; i<SIZE; i++){
        for (j=0; j<SIZE; j++){
            sum = 0.0;
            for (k=0; k<SIZE; k++)
                sum += A[i*SIZE+k]*B[j*SIZE+k];
            C[i*SIZE+j] = sum;
        }
    }
}

void MatrixMul_1_MT(double *A, double *B, double *C) { //マルチスレッドSIMD無し
    int i,j,k;
    double sum;
    double tmp;

    #pragma omp parallel for private(j, tmp)
    for (i=0;i<SIZE;i++){
        for (j=i+1;j<SIZE;j++){
            tmp         = B[j*SIZE+i];
            B[j*SIZE+i] = B[i*SIZE+j];
            B[i*SIZE+j] = tmp;
        }
    }

    #pragma omp parallel for private(j, k, sum)
    for (i=0; i<SIZE; i++){
        for (j=0; j<SIZE; j++){
            sum = 0.0;
            for (k=0; k<SIZE; k++) sum += A[i*SIZE+k]*B[j*SIZE+k];
            C[i*SIZE+j] = sum;
        }
    }
}

void MatrixMul_1_ST_SIMD(double *A, double *B, double *C) { //シングルスレッドSIMD有り
    int i,j,k;
    double tmp;
    __m256d vsum;
    double *sum = (double*)(&vsum);

    for (i=0;i<SIZE;i++){
        for (j=i+1;j<SIZE;j++){
            tmp         = B[j*SIZE+i];
            B[j*SIZE+i] = B[i*SIZE+j];
            B[i*SIZE+j] = tmp;
        }
    }

    for (i=0; i<SIZE; i++){
        for (j=0; j<SIZE; j++){
            vsum = _mm256_set1_pd(0.0);
            for (k=0; k<SIZE; k+=4){
                vsum = _mm256_add_pd(vsum, _mm256_mul_pd(
                        _mm256_load_pd(&A[i*SIZE+k]),
                        _mm256_load_pd(&B[j*SIZE+k])));
            }
            C[i*SIZE+j] = sum[0] + sum[1] + sum[2] + sum[3];
        }
    }
}

void MatrixMul_1_MT_SIMD(double *A, double *B, double *C) { //マルチスレッドSIMD有り
    int i,j,k;
    double tmp;
    __m256d vsum;
    double *sum;

    #pragma omp parallel for private(j, tmp)
    for (i=0;i<SIZE;i++){
        for (j=i+1;j<SIZE;j++){
            tmp         = B[j*SIZE+i];
            B[j*SIZE+i] = B[i*SIZE+j];
            B[i*SIZE+j] = tmp;
        }
    }
    #pragma omp parallel for private(j, k, vsum, sum)
    for (i=0; i<SIZE; i++){
        sum = (double*)(&vsum);
        for (j=0; j<SIZE; j++){
            vsum = _mm256_set1_pd(0.0);
            for (k=0; k<SIZE; k+=4){ //sizeが4指定なので後処理はしない
                vsum = _mm256_add_pd(vsum, _mm256_mul_pd(
                        _mm256_load_pd(&A[i*SIZE+k]),
                        _mm256_load_pd(&B[j*SIZE+k])));
            }
            C[i*SIZE+j] = sum[0] + sum[1] + sum[2] + sum[3];
        }
    }
}

void Measurement_1(char* Title, void (*MatrixMul_1)(double*, double*, double*)){
    int i,j;
    double *a,*b,*c;
    clock_t ts,te;
    clock_t d1,d2,d3;

    printf("//---------------------\n");
    printf(Title);
    printf("Matrix Size: %d X %d.\n",SIZE,SIZE);

    ts = clock();
//---------------------
    a = (double*)malloc(SIZE*SIZE*sizeof(double));
    b = (double*)malloc(SIZE*SIZE*sizeof(double));
    c = (double*)malloc(SIZE*SIZE*sizeof(double));
    d1 = clock();
//---------------------
    srand(10);

    for(i=0; i<SIZE; i++){
        for(j=0; j<SIZE; j++){
            a[i*SIZE+j] = rand()/(double)RAND_MAX;
            b[i*SIZE+j] = rand()/(double)RAND_MAX;
            c[i*SIZE+j] = 0.0;
        }
    }
    d2 = clock();
//---------------------
    MatrixMul_1(a,b,c);
    d3 = clock();
//---------------------
    printf("Result:\n");
    printf("c[0][0]= %.15e ... c[%d][0]=%.15e\n",c[0],SIZE-1,c[(SIZE-1)*SIZE+0]);
    printf("c[0][1]= %.15e ... c[%d][1]=%.15e\n",c[1],SIZE-1,c[(SIZE-1)*SIZE+1]);
    printf("c[0][2]= %.15e ... c[%d][2]=%.15e\n",c[2],SIZE-1,c[(SIZE-1)*SIZE+2]);
    printf("c[0][3]= %.15e ... c[%d][3]=%.15e\n",c[3],SIZE-1,c[(SIZE-1)*SIZE+3]);
    printf(".......\n");
    printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-4,c[SIZE-4],SIZE-1,SIZE-4,c[(SIZE-1)*SIZE+SIZE-4]);
    printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-3,c[SIZE-3],SIZE-1,SIZE-3,c[(SIZE-1)*SIZE+SIZE-3]);
    printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-2,c[SIZE-2],SIZE-1,SIZE-2,c[(SIZE-1)*SIZE+SIZE-2]);
    printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-1,c[SIZE-1],SIZE-1,SIZE-1,c[(SIZE-1)*SIZE+SIZE-1]);

    te = clock();

    printf("\nTotal Time : %f [sec]\n", (float)(te-ts)/CLOCKS_PER_SEC);
    printf("Time of Allocation           : %f [sec]\n", (float)(d1-ts)/CLOCKS_PER_SEC);
    printf("Time of Initialization       : %f [sec]\n", (float)(d2-d1)/CLOCKS_PER_SEC);
    printf("Time of Operation for Matrix : %f [sec]\n\n", (float)(d3-d2)/CLOCKS_PER_SEC);
//---------------------
    free(a);
    free(b);
    free(c);
//---------------------
}

int main() {
    Measurement_1("Serial Program (1d ver.)\n", *MatrixMul_1_ST);
    Measurement_1("Parallel Program (1d ver.)\n", *MatrixMul_1_MT);
    Measurement_1("Serial SIMD Program (1d ver.)\n", *MatrixMul_1_ST_SIMD);
    Measurement_1("Parallel SIMD Program (1d ver.)\n", *MatrixMul_1_MT_SIMD);

    return 0;
}

実行結果

//---------------------
Serial Program (1d ver.)
Matrix Size: 4096 X 4096.
Result:
c[0][0]= 1.037951225539265e+03 ... c[4095][0]=1.026167362779490e+03
c[0][1]= 1.031621032790639e+03 ... c[4095][1]=1.028450900218210e+03
c[0][2]= 1.034022283576758e+03 ... c[4095][2]=1.028215220977095e+03
c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558352e+03
.......
c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03
c[0][4093]= 1.034889413448711e+03 ... c[4095][4093]=1.025519289347927e+03
c[0][4094]= 1.038085640812733e+03 ... c[4095][4094]=1.031131779777061e+03
c[0][4095]= 1.033145491256165e+03 ... c[4095][4095]=1.023042270704371e+03

Total Time : 77.487999 [sec]
Time of Allocation           : 0.001000 [sec]
Time of Initialization       : 0.586000 [sec]
Time of Operation for Matrix : 76.901001 [sec]

//---------------------
Parallel Program (1d ver.)
Matrix Size: 4096 X 4096.
Result:
c[0][0]= 1.037951225539265e+03 ... c[4095][0]=1.026167362779490e+03
c[0][1]= 1.031621032790639e+03 ... c[4095][1]=1.028450900218210e+03
c[0][2]= 1.034022283576758e+03 ... c[4095][2]=1.028215220977095e+03
c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558352e+03
.......
c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03
c[0][4093]= 1.034889413448711e+03 ... c[4095][4093]=1.025519289347927e+03
c[0][4094]= 1.038085640812733e+03 ... c[4095][4094]=1.031131779777061e+03
c[0][4095]= 1.033145491256165e+03 ... c[4095][4095]=1.023042270704371e+03

Total Time : 12.139000 [sec]
Time of Allocation           : 0.001000 [sec]
Time of Initialization       : 0.575000 [sec]
Time of Operation for Matrix : 11.563000 [sec]

//---------------------
Serial SIMD Program (1d ver.)
Matrix Size: 4096 X 4096.
Result:
c[0][0]= 1.037951225539264e+03 ... c[4095][0]=1.026167362779491e+03
c[0][1]= 1.031621032790638e+03 ... c[4095][1]=1.028450900218213e+03
c[0][2]= 1.034022283576759e+03 ... c[4095][2]=1.028215220977093e+03
c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558351e+03
.......
c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03
c[0][4093]= 1.034889413448712e+03 ... c[4095][4093]=1.025519289347928e+03
c[0][4094]= 1.038085640812731e+03 ... c[4095][4094]=1.031131779777060e+03
c[0][4095]= 1.033145491256164e+03 ... c[4095][4095]=1.023042270704369e+03

Total Time : 27.774000 [sec]
Time of Allocation           : 0.001000 [sec]
Time of Initialization       : 0.572000 [sec]
Time of Operation for Matrix : 27.198999 [sec]

//---------------------
Parallel SIMD Program (1d ver.)
Matrix Size: 4096 X 4096.
Result:
c[0][0]= 1.037951225539264e+03 ... c[4095][0]=1.026167362779491e+03
c[0][1]= 1.031621032790638e+03 ... c[4095][1]=1.028450900218213e+03
c[0][2]= 1.034022283576759e+03 ... c[4095][2]=1.028215220977093e+03
c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558351e+03
.......
c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03
c[0][4093]= 1.034889413448712e+03 ... c[4095][4093]=1.025519289347928e+03
c[0][4094]= 1.038085640812731e+03 ... c[4095][4094]=1.031131779777060e+03
c[0][4095]= 1.033145491256164e+03 ... c[4095][4095]=1.023042270704369e+03

Total Time : 7.994000 [sec]
Time of Allocation           : 0.000000 [sec]
Time of Initialization       : 0.575000 [sec]
Time of Operation for Matrix : 7.419000 [sec]
実行結果(表)
並列化無し OpenMP AVX2 OpenMP + AVX2
Time of Operation for Matrix[s] 76.901001 11.563000 27.198999 7.419000
速度比[%] 100 15.04 35.37 9.65

考察

今回は前回に比べるとちゃんと高速化しました。
今回はループの中でも一番内側のkについてSIMDによる並列化を行いましたが、恐らくkよりjについて並列化を行ったほうが高速だと思います。
とりあえずメモ書き程度としてここまで。