C++ Cuda performance for double pointers












0















I finally succeded in saving in memory a double pointer in order to use it in cuda.(The code below), but i see that is less performent than if i would flatten the matrix,which is not that great.



Some suggestions to save some time/memory?



I really want to use dynamic 2d array.



#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>

__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}

int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);


dim3 d(2, 2);
fct << <1, d >> > (d_c);

for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}

for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}









share|improve this question























  • If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

    – Robert Crovella
    Dec 30 '18 at 16:31











  • Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

    – Vlad Constantinescu
    Dec 30 '18 at 23:50
















0















I finally succeded in saving in memory a double pointer in order to use it in cuda.(The code below), but i see that is less performent than if i would flatten the matrix,which is not that great.



Some suggestions to save some time/memory?



I really want to use dynamic 2d array.



#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>

__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}

int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);


dim3 d(2, 2);
fct << <1, d >> > (d_c);

for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}

for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}









share|improve this question























  • If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

    – Robert Crovella
    Dec 30 '18 at 16:31











  • Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

    – Vlad Constantinescu
    Dec 30 '18 at 23:50














0












0








0








I finally succeded in saving in memory a double pointer in order to use it in cuda.(The code below), but i see that is less performent than if i would flatten the matrix,which is not that great.



Some suggestions to save some time/memory?



I really want to use dynamic 2d array.



#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>

__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}

int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);


dim3 d(2, 2);
fct << <1, d >> > (d_c);

for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}

for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}









share|improve this question














I finally succeded in saving in memory a double pointer in order to use it in cuda.(The code below), but i see that is less performent than if i would flatten the matrix,which is not that great.



Some suggestions to save some time/memory?



I really want to use dynamic 2d array.



#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>

__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}

int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);


dim3 d(2, 2);
fct << <1, d >> > (d_c);

for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}

for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}






c++ pointers cuda






share|improve this question













share|improve this question











share|improve this question




share|improve this question










asked Dec 30 '18 at 14:34









Vlad ConstantinescuVlad Constantinescu

344




344













  • If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

    – Robert Crovella
    Dec 30 '18 at 16:31











  • Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

    – Vlad Constantinescu
    Dec 30 '18 at 23:50



















  • If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

    – Robert Crovella
    Dec 30 '18 at 16:31











  • Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

    – Vlad Constantinescu
    Dec 30 '18 at 23:50

















If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31





If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.

– Robert Crovella
Dec 30 '18 at 16:31













Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50





Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.

– Vlad Constantinescu
Dec 30 '18 at 23:50












1 Answer
1






active

oldest

votes


















0














You may actually want to use flattened matrix with some pointer tricks:



int main() {
const int size = 10;

auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}


This way, you can still access the matrix with arr[x][y] syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).



*It is faster to allocate size * size memory once, rather than allocating size times size elements.



Side note: using delete on a malloced memory is undefined behaviour. Don't mix new/new + delete/delete with malloc + free.






share|improve this answer


























  • Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

    – Vlad Constantinescu
    Dec 30 '18 at 15:33













  • @VladConstantinescu What do you mean by simple vector?

    – Fureeish
    Dec 30 '18 at 15:34













  • I mean , 1d array ( int *a = new int[size])

    – Vlad Constantinescu
    Dec 30 '18 at 15:37











  • You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

    – Fureeish
    Dec 30 '18 at 15:49











  • i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

    – Vlad Constantinescu
    Dec 30 '18 at 15:54











Your Answer






StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});


}
});














draft saved

draft discarded


















StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53978493%2fc-cuda-performance-for-double-pointers%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown

























1 Answer
1






active

oldest

votes








1 Answer
1






active

oldest

votes









active

oldest

votes






active

oldest

votes









0














You may actually want to use flattened matrix with some pointer tricks:



int main() {
const int size = 10;

auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}


This way, you can still access the matrix with arr[x][y] syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).



*It is faster to allocate size * size memory once, rather than allocating size times size elements.



Side note: using delete on a malloced memory is undefined behaviour. Don't mix new/new + delete/delete with malloc + free.






share|improve this answer


























  • Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

    – Vlad Constantinescu
    Dec 30 '18 at 15:33













  • @VladConstantinescu What do you mean by simple vector?

    – Fureeish
    Dec 30 '18 at 15:34













  • I mean , 1d array ( int *a = new int[size])

    – Vlad Constantinescu
    Dec 30 '18 at 15:37











  • You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

    – Fureeish
    Dec 30 '18 at 15:49











  • i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

    – Vlad Constantinescu
    Dec 30 '18 at 15:54
















0














You may actually want to use flattened matrix with some pointer tricks:



int main() {
const int size = 10;

auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}


This way, you can still access the matrix with arr[x][y] syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).



*It is faster to allocate size * size memory once, rather than allocating size times size elements.



Side note: using delete on a malloced memory is undefined behaviour. Don't mix new/new + delete/delete with malloc + free.






share|improve this answer


























  • Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

    – Vlad Constantinescu
    Dec 30 '18 at 15:33













  • @VladConstantinescu What do you mean by simple vector?

    – Fureeish
    Dec 30 '18 at 15:34













  • I mean , 1d array ( int *a = new int[size])

    – Vlad Constantinescu
    Dec 30 '18 at 15:37











  • You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

    – Fureeish
    Dec 30 '18 at 15:49











  • i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

    – Vlad Constantinescu
    Dec 30 '18 at 15:54














0












0








0







You may actually want to use flattened matrix with some pointer tricks:



int main() {
const int size = 10;

auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}


This way, you can still access the matrix with arr[x][y] syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).



*It is faster to allocate size * size memory once, rather than allocating size times size elements.



Side note: using delete on a malloced memory is undefined behaviour. Don't mix new/new + delete/delete with malloc + free.






share|improve this answer















You may actually want to use flattened matrix with some pointer tricks:



int main() {
const int size = 10;

auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}


This way, you can still access the matrix with arr[x][y] syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).



*It is faster to allocate size * size memory once, rather than allocating size times size elements.



Side note: using delete on a malloced memory is undefined behaviour. Don't mix new/new + delete/delete with malloc + free.







share|improve this answer














share|improve this answer



share|improve this answer








edited Dec 30 '18 at 15:22

























answered Dec 30 '18 at 14:41









FureeishFureeish

3,27321029




3,27321029













  • Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

    – Vlad Constantinescu
    Dec 30 '18 at 15:33













  • @VladConstantinescu What do you mean by simple vector?

    – Fureeish
    Dec 30 '18 at 15:34













  • I mean , 1d array ( int *a = new int[size])

    – Vlad Constantinescu
    Dec 30 '18 at 15:37











  • You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

    – Fureeish
    Dec 30 '18 at 15:49











  • i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

    – Vlad Constantinescu
    Dec 30 '18 at 15:54



















  • Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

    – Vlad Constantinescu
    Dec 30 '18 at 15:33













  • @VladConstantinescu What do you mean by simple vector?

    – Fureeish
    Dec 30 '18 at 15:34













  • I mean , 1d array ( int *a = new int[size])

    – Vlad Constantinescu
    Dec 30 '18 at 15:37











  • You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

    – Fureeish
    Dec 30 '18 at 15:49











  • i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

    – Vlad Constantinescu
    Dec 30 '18 at 15:54

















Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33







Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?

– Vlad Constantinescu
Dec 30 '18 at 15:33















@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34







@VladConstantinescu What do you mean by simple vector?

– Fureeish
Dec 30 '18 at 15:34















I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37





I mean , 1d array ( int *a = new int[size])

– Vlad Constantinescu
Dec 30 '18 at 15:37













You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49





You can pass this matrix as a 1d array simply by passing *arr (or arr[0]), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?

– Fureeish
Dec 30 '18 at 15:49













i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54





i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?

– Vlad Constantinescu
Dec 30 '18 at 15:54


















draft saved

draft discarded




















































Thanks for contributing an answer to Stack Overflow!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53978493%2fc-cuda-performance-for-double-pointers%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

Monofisismo

Angular Downloading a file using contenturl with Basic Authentication

Olmecas