C++ Cuda performance for double pointers

I finally succeeded in storing a double pointer in device memory so that I can use it in CUDA (code below), but I see that it is less performant than flattening the matrix, which is not great.

Some suggestions to save some time/memory?

I really want to use dynamic 2d array.

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <cstdio>



/*
 * Writes the value 3 into one matrix element per thread.
 *
 * dev_c: device array of device row pointers, addressed as dev_c[row][col].
 *
 * Launch layout: a single 2D block with blockDim.x = number of columns and
 * blockDim.y = number of rows. There is no bounds check, so the block must
 * not be larger than the matrix.
 */
__global__ void fct(int **dev_c)
{
    // threadIdx.x varies fastest inside a warp, so it selects the column:
    // neighbouring threads then write neighbouring ints of the same row.
    int col = threadIdx.x;
    int row = threadIdx.y;

    dev_c[row][col] = 3;
}



// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Builds an N x N matrix on the device as an array of row pointers, lets the
 * kernel fill it, copies it back row by row and prints every element.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA or malloc error.
 */
int main(void)
{
    const int N = 2;  // the matrix is N x N, one thread per element

    // Host-side result matrix.
    int **cc = new int*[N];
    for (int i = 0; i < N; i++) cc[i] = new int[N];

    // Host-side table holding the device address of every row.
    int **h_c = (int **)malloc(N * sizeof(int *));
    if (h_c == NULL) {
        fprintf(stderr, "malloc failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMalloc((void **)&h_c[i], N * sizeof(int)));
    }

    // Device copy of the row-pointer table; this is what the kernel receives.
    int **d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_c, N * sizeof(int *)));
    CUDA_CHECK(cudaMemcpy(d_c, h_c, N * sizeof(int *), cudaMemcpyHostToDevice));

    dim3 block(N, N);
    fct<<<1, block>>>(d_c);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // The row addresses are only known through h_c, so copy back per row.
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMemcpy(cc[i], h_c[i], N * sizeof(int), cudaMemcpyDeviceToHost));
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }

    // Keep the console window open until the user presses Enter.
    getchar();

    // Release every allocation with the API that created it:
    // cudaMalloc -> cudaFree, malloc -> free, new[] -> delete[].
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaFree(h_c[i]));
    }
    CUDA_CHECK(cudaFree(d_c));
    free(h_c);
    for (int i = 0; i < N; i++) delete[] cc[i];
    delete[] cc;

    return 0;
}

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31

Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50

add a comment |

I finally succeeded in storing a double pointer in device memory so that I can use it in CUDA (code below), but I see that it is less performant than flattening the matrix, which is not great.

Some suggestions to save some time/memory?

I really want to use dynamic 2d array.

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <cstdio>



/*
 * Writes the value 3 into one matrix element per thread.
 *
 * dev_c: device array of device row pointers, addressed as dev_c[row][col].
 *
 * Launch layout: a single 2D block with blockDim.x = number of columns and
 * blockDim.y = number of rows. There is no bounds check, so the block must
 * not be larger than the matrix.
 */
__global__ void fct(int **dev_c)
{
    // threadIdx.x varies fastest inside a warp, so it selects the column:
    // neighbouring threads then write neighbouring ints of the same row.
    int col = threadIdx.x;
    int row = threadIdx.y;

    dev_c[row][col] = 3;
}



// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Builds an N x N matrix on the device as an array of row pointers, lets the
 * kernel fill it, copies it back row by row and prints every element.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA or malloc error.
 */
int main(void)
{
    const int N = 2;  // the matrix is N x N, one thread per element

    // Host-side result matrix.
    int **cc = new int*[N];
    for (int i = 0; i < N; i++) cc[i] = new int[N];

    // Host-side table holding the device address of every row.
    int **h_c = (int **)malloc(N * sizeof(int *));
    if (h_c == NULL) {
        fprintf(stderr, "malloc failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMalloc((void **)&h_c[i], N * sizeof(int)));
    }

    // Device copy of the row-pointer table; this is what the kernel receives.
    int **d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_c, N * sizeof(int *)));
    CUDA_CHECK(cudaMemcpy(d_c, h_c, N * sizeof(int *), cudaMemcpyHostToDevice));

    dim3 block(N, N);
    fct<<<1, block>>>(d_c);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // The row addresses are only known through h_c, so copy back per row.
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMemcpy(cc[i], h_c[i], N * sizeof(int), cudaMemcpyDeviceToHost));
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }

    // Keep the console window open until the user presses Enter.
    getchar();

    // Release every allocation with the API that created it:
    // cudaMalloc -> cudaFree, malloc -> free, new[] -> delete[].
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaFree(h_c[i]));
    }
    CUDA_CHECK(cudaFree(d_c));
    free(h_c);
    for (int i = 0; i < N; i++) delete[] cc[i];
    delete[] cc;

    return 0;
}

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31

Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50

add a comment |

I finally succeeded in storing a double pointer in device memory so that I can use it in CUDA (code below), but I see that it is less performant than flattening the matrix, which is not great.

Some suggestions to save some time/memory?

I really want to use dynamic 2d array.

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <cstdio>



/*
 * Writes the value 3 into one matrix element per thread.
 *
 * dev_c: device array of device row pointers, addressed as dev_c[row][col].
 *
 * Launch layout: a single 2D block with blockDim.x = number of columns and
 * blockDim.y = number of rows. There is no bounds check, so the block must
 * not be larger than the matrix.
 */
__global__ void fct(int **dev_c)
{
    // threadIdx.x varies fastest inside a warp, so it selects the column:
    // neighbouring threads then write neighbouring ints of the same row.
    int col = threadIdx.x;
    int row = threadIdx.y;

    dev_c[row][col] = 3;
}



// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Builds an N x N matrix on the device as an array of row pointers, lets the
 * kernel fill it, copies it back row by row and prints every element.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA or malloc error.
 */
int main(void)
{
    const int N = 2;  // the matrix is N x N, one thread per element

    // Host-side result matrix.
    int **cc = new int*[N];
    for (int i = 0; i < N; i++) cc[i] = new int[N];

    // Host-side table holding the device address of every row.
    int **h_c = (int **)malloc(N * sizeof(int *));
    if (h_c == NULL) {
        fprintf(stderr, "malloc failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMalloc((void **)&h_c[i], N * sizeof(int)));
    }

    // Device copy of the row-pointer table; this is what the kernel receives.
    int **d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_c, N * sizeof(int *)));
    CUDA_CHECK(cudaMemcpy(d_c, h_c, N * sizeof(int *), cudaMemcpyHostToDevice));

    dim3 block(N, N);
    fct<<<1, block>>>(d_c);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // The row addresses are only known through h_c, so copy back per row.
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMemcpy(cc[i], h_c[i], N * sizeof(int), cudaMemcpyDeviceToHost));
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }

    // Keep the console window open until the user presses Enter.
    getchar();

    // Release every allocation with the API that created it:
    // cudaMalloc -> cudaFree, malloc -> free, new[] -> delete[].
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaFree(h_c[i]));
    }
    CUDA_CHECK(cudaFree(d_c));
    free(h_c);
    for (int i = 0; i < N; i++) delete[] cc[i];
    delete[] cc;

    return 0;
}

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

I finally succeeded in storing a double pointer in device memory so that I can use it in CUDA (code below), but I see that it is less performant than flattening the matrix, which is not great.

Some suggestions to save some time/memory?

I really want to use dynamic 2d array.

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <cstdio>



/*
 * Writes the value 3 into one matrix element per thread.
 *
 * dev_c: device array of device row pointers, addressed as dev_c[row][col].
 *
 * Launch layout: a single 2D block with blockDim.x = number of columns and
 * blockDim.y = number of rows. There is no bounds check, so the block must
 * not be larger than the matrix.
 */
__global__ void fct(int **dev_c)
{
    // threadIdx.x varies fastest inside a warp, so it selects the column:
    // neighbouring threads then write neighbouring ints of the same row.
    int col = threadIdx.x;
    int row = threadIdx.y;

    dev_c[row][col] = 3;
}



// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Builds an N x N matrix on the device as an array of row pointers, lets the
 * kernel fill it, copies it back row by row and prints every element.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA or malloc error.
 */
int main(void)
{
    const int N = 2;  // the matrix is N x N, one thread per element

    // Host-side result matrix.
    int **cc = new int*[N];
    for (int i = 0; i < N; i++) cc[i] = new int[N];

    // Host-side table holding the device address of every row.
    int **h_c = (int **)malloc(N * sizeof(int *));
    if (h_c == NULL) {
        fprintf(stderr, "malloc failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMalloc((void **)&h_c[i], N * sizeof(int)));
    }

    // Device copy of the row-pointer table; this is what the kernel receives.
    int **d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_c, N * sizeof(int *)));
    CUDA_CHECK(cudaMemcpy(d_c, h_c, N * sizeof(int *), cudaMemcpyHostToDevice));

    dim3 block(N, N);
    fct<<<1, block>>>(d_c);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // The row addresses are only known through h_c, so copy back per row.
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaMemcpy(cc[i], h_c[i], N * sizeof(int), cudaMemcpyDeviceToHost));
    }

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }

    // Keep the console window open until the user presses Enter.
    getchar();

    // Release every allocation with the API that created it:
    // cudaMalloc -> cudaFree, malloc -> free, new[] -> delete[].
    for (int i = 0; i < N; i++) {
        CUDA_CHECK(cudaFree(h_c[i]));
    }
    CUDA_CHECK(cudaFree(d_c));
    free(h_c);
    for (int i = 0; i < N; i++) delete[] cc[i];
    delete[] cc;

    return 0;
}

c++ pointers cuda

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

asked Dec 30 '18 at 14:34

Vlad Constantinescu

344

If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31

Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50

add a comment |

If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31

Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50

If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31

Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50

add a comment |

1 Answer
1

active

oldest

votes

You may actually want to use flattened matrix with some pointer tricks:

// Demonstrates a size x size matrix stored in one contiguous buffer while
// still being addressable as arr[row][col] through a table of row pointers.
int main() {
    const int size = 10;

    // Row-pointer table; arr[0] owns the single contiguous data buffer.
    auto arr = new int*[size];
    arr[0] = new int[size * size];

    // Every further row pointer aims at its slice of that same buffer.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only arr[0] owns memory: release the data buffer, then the table.
    delete[] arr[0];
    delete[] arr;
}

This way, you can still access the matrix with arr[x][y] syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).

* It is faster to allocate size * size elements once than to allocate size elements size times.

Side note: using delete on malloc'ed memory is undefined behaviour. Don't mix new/new[] + delete/delete[] with malloc + free.

edited Dec 30 '18 at 15:22

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33

@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34

I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37

You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49

i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54

|
show 2 more comments

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Page bootstrap: sets up the tag renderer and the answer editor once the
// StackExchange page scripts are ready.
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

// Configures the answer editor. The two HTML strings need their inner quotes
// and angle brackets escaped (\" and \u003c / \u003e) to be valid literals.
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53978493%2fc-cuda-performance-for-double-pointers%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

You may actually want to use flattened matrix with some pointer tricks:

// Demonstrates a size x size matrix stored in one contiguous buffer while
// still being addressable as arr[row][col] through a table of row pointers.
int main() {
    const int size = 10;

    // Row-pointer table; arr[0] owns the single contiguous data buffer.
    auto arr = new int*[size];
    arr[0] = new int[size * size];

    // Every further row pointer aims at its slice of that same buffer.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only arr[0] owns memory: release the data buffer, then the table.
    delete[] arr[0];
    delete[] arr;
}

* It is faster to allocate size * size elements once than to allocate size elements size times.

Side note: using delete on malloc'ed memory is undefined behaviour. Don't mix new/new[] + delete/delete[] with malloc + free.

edited Dec 30 '18 at 15:22

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33

@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34

I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37

You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49

i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54

|
show 2 more comments

You may actually want to use flattened matrix with some pointer tricks:

// Demonstrates a size x size matrix stored in one contiguous buffer while
// still being addressable as arr[row][col] through a table of row pointers.
int main() {
    const int size = 10;

    // Row-pointer table; arr[0] owns the single contiguous data buffer.
    auto arr = new int*[size];
    arr[0] = new int[size * size];

    // Every further row pointer aims at its slice of that same buffer.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only arr[0] owns memory: release the data buffer, then the table.
    delete[] arr[0];
    delete[] arr;
}

* It is faster to allocate size * size elements once than to allocate size elements size times.

Side note: using delete on malloc'ed memory is undefined behaviour. Don't mix new/new[] + delete/delete[] with malloc + free.

edited Dec 30 '18 at 15:22

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33

@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34

I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37

You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49

i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54

|
show 2 more comments

You may actually want to use flattened matrix with some pointer tricks:

// Demonstrates a size x size matrix stored in one contiguous buffer while
// still being addressable as arr[row][col] through a table of row pointers.
int main() {
    const int size = 10;

    // Row-pointer table; arr[0] owns the single contiguous data buffer.
    auto arr = new int*[size];
    arr[0] = new int[size * size];

    // Every further row pointer aims at its slice of that same buffer.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only arr[0] owns memory: release the data buffer, then the table.
    delete[] arr[0];
    delete[] arr;
}

* It is faster to allocate size * size elements once than to allocate size elements size times.

Side note: using delete on malloc'ed memory is undefined behaviour. Don't mix new/new[] + delete/delete[] with malloc + free.

edited Dec 30 '18 at 15:22

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

You may actually want to use flattened matrix with some pointer tricks:

// Demonstrates a size x size matrix stored in one contiguous buffer while
// still being addressable as arr[row][col] through a table of row pointers.
int main() {
    const int size = 10;

    // Row-pointer table; arr[0] owns the single contiguous data buffer.
    auto arr = new int*[size];
    arr[0] = new int[size * size];

    // Every further row pointer aims at its slice of that same buffer.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only arr[0] owns memory: release the data buffer, then the table.
    delete[] arr[0];
    delete[] arr;
}

* It is faster to allocate size * size elements once than to allocate size elements size times.

Side note: using delete on malloc'ed memory is undefined behaviour. Don't mix new/new[] + delete/delete[] with malloc + free.

edited Dec 30 '18 at 15:22

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

edited Dec 30 '18 at 15:22

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

answered Dec 30 '18 at 14:41

Fureeish

3,27321029

Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33

@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34

I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37

You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49

i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54

|
show 2 more comments

Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33

@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34

I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37

You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49

i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54

Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33

@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34

I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37

You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49

i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54

|
show 2 more comments

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

U7 i,O6YsHZHEqT8k8xSjLPo49 jqnhQCP9Rr13AQGgPn3Q,EMLH hRYV37xxICn

搜尋此網誌

Bdtjtk