MultiGPUMatMul/main.cpp at main · hoyathali/MultiGPUMatMul · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#include <algorithm>
#include <boost/mpi/datatype_fwd.hpp>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <fstream>

#include <mpi.h>
#include <boost/mpi/datatype.hpp>

#include "mult.cuh"

#define verbose false // Debug switch: when true, prints the input matrices, each rank's received row/column bands, each computed tile, and the final matrix_C to stdout

// Generator functor used with std::generate to fill matrix A.
// Each call advances an internal integer counter and yields it as a float,
// so a fresh generator produces 1, 2, 3, ... (or init+1, init+2, ... when an
// initial value is supplied). nRows/nCols are stored but not consulted here.
struct genMatrix_A {
    unsigned int nRows;
    unsigned int nCols;
    int counter = 0;

    genMatrix_A(unsigned int nRows, unsigned int nCols, float init=0)
        : nRows(nRows),
          nCols(nCols),
          counter(static_cast<int>(init)) // the float seed is truncated to int
    {
    }

    // Returns the next value of the sequence (pre-incremented counter).
    float operator()()
    {
        counter += 1;
        return static_cast<float>(counter);
    }
};

// Generator functor used with std::generate to fill matrix B.
// Every call yields 1, i.e. B becomes an all-ones matrix. The dimensions and
// the counter are stored for parity with genMatrix_A but are not used by
// operator().
struct genMatrix_B {
    unsigned int nRows;
    unsigned int nCols;
    int counter = 0;

    genMatrix_B(unsigned int nRows, unsigned int nCols, float init=0)
        : nRows(nRows),
          nCols(nCols),
          counter(static_cast<int>(init)) // the float seed is truncated to int
    {
    }

    // Always returns 1; the counter is left untouched.
    float operator()()
    {
        const float one = 1.0f;
        return one;
    }
};


void matrixMult()
{
    int rank, size;
    float* column = (float*)calloc(BAND_SIZE * K_GLOBAL, sizeof(float)); // Buffer to receive the column
    float* row = (float*)calloc(BAND_SIZE * K_GLOBAL, sizeof(float)); // Buffer to receive the column
    float* res = (float*)calloc(BAND_SIZE * BAND_SIZE, sizeof(float)); // Buffer to receive the column

    float *d_column = nullptr, *d_row = nullptr, *d_res = nullptr;
    gpuErrchk( cudaMalloc((void**)&d_column, BAND_SIZE * K_GLOBAL * sizeof(float)) );
    gpuErrchk( cudaMalloc((void**)&d_row, BAND_SIZE * K_GLOBAL * sizeof(float)) );
    gpuErrchk( cudaMalloc((void**)&d_res, BAND_SIZE * BAND_SIZE * sizeof(float)) );
    gpuErrchk( cudaMemset(d_column, 0, BAND_SIZE * K_GLOBAL * sizeof(float)) );
    gpuErrchk( cudaMemset(d_row, 0, BAND_SIZE * K_GLOBAL * sizeof(float)) );

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    std::vector<float> matrix_A;
    std::vector<float> matrix_B;
    std::vector<float> matrix_C;

    matrix_A.resize(M_GLOBAL*K_GLOBAL);
    matrix_B.resize(K_GLOBAL*N_GLOBAL);
    matrix_C.resize(M_GLOBAL*N_GLOBAL);

    // std::generate(matrix_A.begin(), matrix_A.end(), [n = 0] () mutable { return n++; });
    std::generate(matrix_A.begin(), matrix_A.end(), genMatrix_A(M_GLOBAL, K_GLOBAL));
    std::generate(matrix_B.begin(), matrix_B.end(), genMatrix_B(K_GLOBAL, N_GLOBAL));


    // Process 0 prints the original matrix_B
    if (rank == 0) {

        if(verbose){

        std::cout<<"Original matrix_A:"<<std::endl;
        for (int i = 0; i < M_GLOBAL; i++) {
            for (int j = 0; j < K_GLOBAL; j++)
                std::cout<<matrix_A[i*K_GLOBAL + j] << "\t";
            std::cout<<std::endl;
        }
        std::cout<<std::endl;

        std::cout<<"Original matrix_B:"<<std::endl;;
        for (int i = 0; i < K_GLOBAL; i++) {
            for (int j = 0; j < N_GLOBAL; j++)
                std::cout<<matrix_B[i*N_GLOBAL + j] << "\t";
            std::cout<<std::endl;
        }
     }

    std::cout<<std::endl;
    std::cout<<"Matrix A size: "<<M_GLOBAL<<" * "<<K_GLOBAL<<std::endl;
    std::cout<<"Matrix B size: "<<K_GLOBAL<<" * "<<N_GLOBAL<<std::endl;
    std::cout<<"Band size: " << BAND_SIZE<<std::endl;

    }


    // Define the datatype for a column
    MPI_Datatype col, coltype;
    MPI_Type_vector(K_GLOBAL, BAND_SIZE, N_GLOBAL, boost::mpi::get_mpi_datatype<float>(), &col);
    MPI_Type_commit(&col);
    MPI_Type_create_resized(col, 0, BAND_SIZE*sizeof(float), &coltype);
    MPI_Type_commit(&coltype);

    // Define the datatype for a column
    MPI_Datatype C_col, C_coltype;
    MPI_Type_vector(BAND_SIZE, BAND_SIZE, N_GLOBAL, boost::mpi::get_mpi_datatype<float>(), &C_col);
    MPI_Type_commit(&C_col);
    MPI_Type_create_resized(C_col, 0, BAND_SIZE*sizeof(float), &C_coltype);
    MPI_Type_commit(&C_coltype);

    bool forward=true;
    bool switched=false;
    int r=0;

    for(int c=0; (c+rank)*BAND_SIZE < N_GLOBAL; c+=size)
    {
	// Scatter the columns of the matrix_B
	MPI_Scatter(matrix_B.data() + (c+rank)*BAND_SIZE, 1, coltype, column, BAND_SIZE*K_GLOBAL, boost::mpi::get_mpi_datatype<float>(), 0, MPI_COMM_WORLD);
	gpuErrchk( cudaMemcpy(d_column, column, BAND_SIZE * K_GLOBAL * sizeof(float), cudaMemcpyHostToDevice) );

	for(; ; forward ? r++:r--)
	{
	    // Broadcast the rows of the matrix_A
	    if(!switched)
	    {
		if (rank == 0)
		{
		    //cudaMemcpy(d_row, matrix_A.data() + r * K_GLOBAL * BAND_SIZE, BAND_SIZE * K_GLOBAL * sizeof(float), cudaMemcpyHostToDevice);
		    memcpy(row, matrix_A.data() + r * K_GLOBAL * BAND_SIZE, BAND_SIZE * K_GLOBAL * sizeof(float));
		}
		MPI_Bcast(row, BAND_SIZE * K_GLOBAL, boost::mpi::get_mpi_datatype<float>(), 0, MPI_COMM_WORLD);
		gpuErrchk( cudaMemcpy(d_row, row, BAND_SIZE * K_GLOBAL * sizeof(float), cudaMemcpyHostToDevice) );
	    }
	    switched=false;

	    computeMM(d_row, d_column, d_res , BAND_SIZE, K_GLOBAL, BAND_SIZE);
	    //cublasMM(d_row, d_column, d_res , BAND_SIZE, K_GLOBAL, BAND_SIZE);

	    gpuErrchk( cudaMemcpy(res, d_res, BAND_SIZE * BAND_SIZE * sizeof(float), cudaMemcpyDeviceToHost) );
	    //MPI_Gather(res, BAND_SIZE*BAND_SIZE, boost::mpi::get_mpi_datatype<float>(), matrix_C.data() + r * N_GLOBAL * BAND_SIZE + c * BAND_SIZE, 1, C_coltype, 0, MPI_COMM_WORLD);
	    for(int k=0; k<BAND_SIZE; k++)
	    {
		MPI_Gather(res+k*BAND_SIZE, BAND_SIZE, boost::mpi::get_mpi_datatype<float>(), matrix_C.data() + (r * BAND_SIZE+k) * N_GLOBAL + c * BAND_SIZE, BAND_SIZE, boost::mpi::get_mpi_datatype<float>(), 0, MPI_COMM_WORLD);
	    }


	    if(verbose)
	    {
		// Each process prints the received column
		std::cout<<"Process "<<rank<<" received row band: ";
		for (int i = 0; i < BAND_SIZE * K_GLOBAL; i++)
		{
		    float temp;
		    cudaMemcpy(&temp, d_row+i, sizeof(float), cudaMemcpyDeviceToHost);
		    std::cout<<temp<<" ";
		}
		std::cout<<std::endl;

		// Each process prints the received column
		std::cout<<"Process "<<rank<<" received column band: ";
		for (int i = 0; i < BAND_SIZE * K_GLOBAL; i++)
		{
		    float temp;
		    cudaMemcpy(&temp, d_column+i, sizeof(float), cudaMemcpyDeviceToHost);
		    std::cout<<temp<<" ";
		}
		std::cout<<std::endl;

		// Each process prints the resultant matrix
		std::cout<<"Process "<<rank<<" computed ("<<r<<" "<<c<<"): ";
		for (int i = 0; i < BAND_SIZE * BAND_SIZE; i++)
		{
		    float temp;
		    cudaMemcpy(&temp, d_res+i, sizeof(float), cudaMemcpyDeviceToHost);
		    std::cout<<temp<<" ";
		}
		std::cout<<std::endl;
	    }

	    //Handing iterator logic to benefit from one overlap in every iteration
	    if(r==0 && !forward)
	    {
		forward=true;
		switched=true;
		break;
	    }
	    if(r==(M_GLOBAL/BAND_SIZE) -1 && forward)
	    {
		forward=false;
		switched=true;
		break;
	    }
	}
    }

    // Process 0 prints the original matrix_B
    if (rank == 0) {

    // Open a file in write mode.
     std::ofstream outFile("mpi_matrix_output.txt");
      if(verbose){
         std::cout<<"Computed matrix_C:"<<std::endl;
          }
        for (int i = 0; i < M_GLOBAL; i++) {
            for (int j = 0; j < N_GLOBAL; j++){
               if(verbose){
                std::cout<<matrix_C[i*N_GLOBAL + j] << "\t";
               }
                outFile<<matrix_C[i*N_GLOBAL + j] << "\t";
        }
          outFile<<"\n";
         if(verbose){
          std::cout<<std::endl;
         }
    }
      std::cout<<std::endl<<"Matrix Multiplication Completed!"<<std::endl;
      outFile.close();

    }

    MPI_Type_free(&coltype);
    MPI_Type_free(&col);

    gpuErrchk( cudaFree(d_column) );
    gpuErrchk( cudaFree(d_row) );
    gpuErrchk( cudaFree(d_res) );
    free(column);
    free(row);
    free(res);
}

// Program entry point: verifies the GPU's compute capability, initialises
// MPI, validates that the matrix dimensions fit the band decomposition for
// the current number of ranks, and runs the distributed multiplication.
// Returns 0 on success and 1 when the size prerequisite is not met; exits
// with status 1 if the GPU is too old.
int main(int argc, char *argv[]) {
    int dev=0;

    // NOTE(review): deviceProp is not declared in this function; presumably it
    // is provided by mult.cuh - confirm.
    gpuErrchk(cudaGetDeviceProperties(&deviceProp, dev));

    // Tensor cores as used here require a GPU of Ampere (SM 8.x) architecture or higher.
    if (deviceProp.major < 8) {
        printf("tf32TensorCoreGemm requires requires SM 8.0 or higher to use Tensor Cores.  Exiting...\n");
        exit(1);
    }

    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    static_assert( M_GLOBAL % BAND_SIZE == 0 );
    static_assert( N_GLOBAL % BAND_SIZE == 0 );

    // Only the column bands of B (and of C) are split across ranks; row bands
    // of A are broadcast to everyone. Every rank therefore takes part in the
    // same number of collective calls only if the number of column bands is a
    // multiple of the rank count, i.e. N_GLOBAL is a multiple of
    // size*BAND_SIZE. M_GLOBAL merely has to be a multiple of BAND_SIZE, which
    // is asserted above.
    if (N_GLOBAL % (size*BAND_SIZE) != 0) {
        if (rank == 0)
            printf("Prereq issue.\n");
        MPI_Finalize();
        return 1;
    }

    matrixMult();

    MPI_Finalize();
    return 0;
}