講義で作ったOpenMPとAVX2命令を使って4096x4096の行列積を計算するプログラムです。そのまま並列化しろという内容だったので最適化はちゃんとしていませんが、折角作ったので公開しておきます。
ソースコード
#include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #include <immintrin.h> #define SIZE 4096 void MatrixMul_1_ST(double *A, double *B, double *C) { //シングルスレッドSIMD無し int i,j,k; double sum; double tmp; for (i=0;i<SIZE;i++){ for (j=i+1;j<SIZE;j++){ tmp = B[j*SIZE+i]; B[j*SIZE+i] = B[i*SIZE+j]; B[i*SIZE+j] = tmp; } } for (i=0; i<SIZE; i++){ for (j=0; j<SIZE; j++){ sum = 0.0; for (k=0; k<SIZE; k++) sum += A[i*SIZE+k]*B[j*SIZE+k]; C[i*SIZE+j] = sum; } } } void MatrixMul_1_MT(double *A, double *B, double *C) { //マルチスレッドSIMD無し int i,j,k; double sum; double tmp; #pragma omp parallel for private(j, tmp) for (i=0;i<SIZE;i++){ for (j=i+1;j<SIZE;j++){ tmp = B[j*SIZE+i]; B[j*SIZE+i] = B[i*SIZE+j]; B[i*SIZE+j] = tmp; } } #pragma omp parallel for private(j, k, sum) for (i=0; i<SIZE; i++){ for (j=0; j<SIZE; j++){ sum = 0.0; for (k=0; k<SIZE; k++) sum += A[i*SIZE+k]*B[j*SIZE+k]; C[i*SIZE+j] = sum; } } } void MatrixMul_1_ST_SIMD(double *A, double *B, double *C) { //シングルスレッドSIMD有り int i,j,k; double tmp; __m256d vsum; double *sum = (double*)(&vsum); for (i=0;i<SIZE;i++){ for (j=i+1;j<SIZE;j++){ tmp = B[j*SIZE+i]; B[j*SIZE+i] = B[i*SIZE+j]; B[i*SIZE+j] = tmp; } } for (i=0; i<SIZE; i++){ for (j=0; j<SIZE; j++){ vsum = _mm256_set1_pd(0.0); for (k=0; k<SIZE; k+=4){ vsum = _mm256_add_pd(vsum, _mm256_mul_pd( _mm256_load_pd(&A[i*SIZE+k]), _mm256_load_pd(&B[j*SIZE+k]))); } C[i*SIZE+j] = sum[0] + sum[1] + sum[2] + sum[3]; } } } void MatrixMul_1_MT_SIMD(double *A, double *B, double *C) { //マルチスレッドSIMD有り int i,j,k; double tmp; __m256d vsum; double *sum; #pragma omp parallel for private(j, tmp) for (i=0;i<SIZE;i++){ for (j=i+1;j<SIZE;j++){ tmp = B[j*SIZE+i]; B[j*SIZE+i] = B[i*SIZE+j]; B[i*SIZE+j] = tmp; } } #pragma omp parallel for private(j, k, vsum, sum) for (i=0; i<SIZE; i++){ sum = (double*)(&vsum); for (j=0; j<SIZE; j++){ vsum = _mm256_set1_pd(0.0); for (k=0; k<SIZE; k+=4){ //sizeが4指定なので後処理はしない vsum = _mm256_add_pd(vsum, _mm256_mul_pd( _mm256_load_pd(&A[i*SIZE+k]), _mm256_load_pd(&B[j*SIZE+k]))); } C[i*SIZE+j] = sum[0] + sum[1] + sum[2] + sum[3]; } } } void Measurement_1(char* Title, void (*MatrixMul_1)(double*, double*, double*)){ int i,j; double *a,*b,*c; clock_t ts,te; clock_t d1,d2,d3; printf("//---------------------\n"); printf(Title); printf("Matrix Size: %d X %d.\n",SIZE,SIZE); ts = clock(); //--------------------- a = (double*)malloc(SIZE*SIZE*sizeof(double)); b = (double*)malloc(SIZE*SIZE*sizeof(double)); c = (double*)malloc(SIZE*SIZE*sizeof(double)); d1 = clock(); //--------------------- srand(10); for(i=0; i<SIZE; i++){ for(j=0; j<SIZE; j++){ a[i*SIZE+j] = rand()/(double)RAND_MAX; b[i*SIZE+j] = rand()/(double)RAND_MAX; c[i*SIZE+j] = 0.0; } } d2 = clock(); //--------------------- MatrixMul_1(a,b,c); d3 = clock(); //--------------------- printf("Result:\n"); printf("c[0][0]= %.15e ... c[%d][0]=%.15e\n",c[0],SIZE-1,c[(SIZE-1)*SIZE+0]); printf("c[0][1]= %.15e ... c[%d][1]=%.15e\n",c[1],SIZE-1,c[(SIZE-1)*SIZE+1]); printf("c[0][2]= %.15e ... c[%d][2]=%.15e\n",c[2],SIZE-1,c[(SIZE-1)*SIZE+2]); printf("c[0][3]= %.15e ... c[%d][3]=%.15e\n",c[3],SIZE-1,c[(SIZE-1)*SIZE+3]); printf(".......\n"); printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-4,c[SIZE-4],SIZE-1,SIZE-4,c[(SIZE-1)*SIZE+SIZE-4]); printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-3,c[SIZE-3],SIZE-1,SIZE-3,c[(SIZE-1)*SIZE+SIZE-3]); printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-2,c[SIZE-2],SIZE-1,SIZE-2,c[(SIZE-1)*SIZE+SIZE-2]); printf("c[0][%d]= %.15e ... c[%d][%d]=%.15e\n",SIZE-1,c[SIZE-1],SIZE-1,SIZE-1,c[(SIZE-1)*SIZE+SIZE-1]); te = clock(); printf("\nTotal Time : %f [sec]\n", (float)(te-ts)/CLOCKS_PER_SEC); printf("Time of Allocation : %f [sec]\n", (float)(d1-ts)/CLOCKS_PER_SEC); printf("Time of Initialization : %f [sec]\n", (float)(d2-d1)/CLOCKS_PER_SEC); printf("Time of Operation for Matrix : %f [sec]\n\n", (float)(d3-d2)/CLOCKS_PER_SEC); //--------------------- free(a); free(b); free(c); //--------------------- } int main() { Measurement_1("Serial Program (1d ver.)\n", *MatrixMul_1_ST); Measurement_1("Parallel Program (1d ver.)\n", *MatrixMul_1_MT); Measurement_1("Serial SIMD Program (1d ver.)\n", *MatrixMul_1_ST_SIMD); Measurement_1("Parallel SIMD Program (1d ver.)\n", *MatrixMul_1_MT_SIMD); return 0; }
実行結果
//--------------------- Serial Program (1d ver.) Matrix Size: 4096 X 4096. Result: c[0][0]= 1.037951225539265e+03 ... c[4095][0]=1.026167362779490e+03 c[0][1]= 1.031621032790639e+03 ... c[4095][1]=1.028450900218210e+03 c[0][2]= 1.034022283576758e+03 ... c[4095][2]=1.028215220977095e+03 c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558352e+03 ....... c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03 c[0][4093]= 1.034889413448711e+03 ... c[4095][4093]=1.025519289347927e+03 c[0][4094]= 1.038085640812733e+03 ... c[4095][4094]=1.031131779777061e+03 c[0][4095]= 1.033145491256165e+03 ... c[4095][4095]=1.023042270704371e+03 Total Time : 77.487999 [sec] Time of Allocation : 0.001000 [sec] Time of Initialization : 0.586000 [sec] Time of Operation for Matrix : 76.901001 [sec] //--------------------- Parallel Program (1d ver.) Matrix Size: 4096 X 4096. Result: c[0][0]= 1.037951225539265e+03 ... c[4095][0]=1.026167362779490e+03 c[0][1]= 1.031621032790639e+03 ... c[4095][1]=1.028450900218210e+03 c[0][2]= 1.034022283576758e+03 ... c[4095][2]=1.028215220977095e+03 c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558352e+03 ....... c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03 c[0][4093]= 1.034889413448711e+03 ... c[4095][4093]=1.025519289347927e+03 c[0][4094]= 1.038085640812733e+03 ... c[4095][4094]=1.031131779777061e+03 c[0][4095]= 1.033145491256165e+03 ... c[4095][4095]=1.023042270704371e+03 Total Time : 12.139000 [sec] Time of Allocation : 0.001000 [sec] Time of Initialization : 0.575000 [sec] Time of Operation for Matrix : 11.563000 [sec] //--------------------- Serial SIMD Program (1d ver.) Matrix Size: 4096 X 4096. Result: c[0][0]= 1.037951225539264e+03 ... c[4095][0]=1.026167362779491e+03 c[0][1]= 1.031621032790638e+03 ... c[4095][1]=1.028450900218213e+03 c[0][2]= 1.034022283576759e+03 ... c[4095][2]=1.028215220977093e+03 c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558351e+03 ....... c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03 c[0][4093]= 1.034889413448712e+03 ... c[4095][4093]=1.025519289347928e+03 c[0][4094]= 1.038085640812731e+03 ... c[4095][4094]=1.031131779777060e+03 c[0][4095]= 1.033145491256164e+03 ... c[4095][4095]=1.023042270704369e+03 Total Time : 27.774000 [sec] Time of Allocation : 0.001000 [sec] Time of Initialization : 0.572000 [sec] Time of Operation for Matrix : 27.198999 [sec] //--------------------- Parallel SIMD Program (1d ver.) Matrix Size: 4096 X 4096. Result: c[0][0]= 1.037951225539264e+03 ... c[4095][0]=1.026167362779491e+03 c[0][1]= 1.031621032790638e+03 ... c[4095][1]=1.028450900218213e+03 c[0][2]= 1.034022283576759e+03 ... c[4095][2]=1.028215220977093e+03 c[0][3]= 1.034019113402438e+03 ... c[4095][3]=1.028631778558351e+03 ....... c[0][4092]= 1.034580439819138e+03 ... c[4095][4092]=1.042994714743114e+03 c[0][4093]= 1.034889413448712e+03 ... c[4095][4093]=1.025519289347928e+03 c[0][4094]= 1.038085640812731e+03 ... c[4095][4094]=1.031131779777060e+03 c[0][4095]= 1.033145491256164e+03 ... c[4095][4095]=1.023042270704369e+03 Total Time : 7.994000 [sec] Time of Allocation : 0.000000 [sec] Time of Initialization : 0.575000 [sec] Time of Operation for Matrix : 7.419000 [sec]