C++ Cuda performance for double pointers
I finally succeeded in storing a double pointer (an array of row pointers) in device memory so that I can use it in CUDA (the code is below), but I see that it performs worse than flattening the matrix would, which is not great.
Some suggestions to save some time/memory?
I really want to use dynamic 2d array.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>
/**
 * Writes the value 3 into one matrix element per thread.
 *
 * dev_c is a table of row pointers; both the table and the rows it points to
 * must be device-accessible memory, because the kernel dereferences both.
 *
 * Launch layout: a single 2D thread block. threadIdx.y selects the row and
 * threadIdx.x selects the column, so threads that are adjacent in x write
 * adjacent ints of the same row.
 *
 * Precondition: the matrix must have at least blockDim.y rows and at least
 * blockDim.x ints per row. The dimensions are not passed in, so no bounds
 * check can be performed here.
 */
__global__ void fct(int **dev_c)
{
    const int row = threadIdx.y;
    const int col = threadIdx.x;
    dev_c[row][col] = 3;
}
// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Fills an n x n device matrix through an int** row-pointer table, copies it
 * back to the host and prints every element.
 *
 * The matrix elements live in ONE contiguous device allocation; the row
 * pointers merely point into it. The kernel can therefore keep using
 * dev_c[i][j] syntax while the host needs a single cudaMalloc and a single
 * cudaMemcpy for the data instead of one per row.
 */
int main(void)
{
    const int n = 2;  // matrix is n x n and is covered by one n x n thread block
    const size_t dataBytes = (size_t)n * n * sizeof(int);
    const size_t tableBytes = (size_t)n * sizeof(int *);

    // Output array on the host: contiguous storage plus a row-pointer view,
    // so the results can still be read as cc[i][j].
    int *hostData = new int[n * n];
    int **cc = new int*[n];
    for (int i = 0; i < n; i++) cc[i] = hostData + i * n;

    // Contiguous device storage for all matrix elements.
    int *d_data = NULL;
    checkCuda(cudaMalloc((void **)&d_data, dataBytes), "cudaMalloc(matrix data)");

    // Host-side staging table holding DEVICE row addresses. These pointers
    // are only computed here, never dereferenced on the host.
    int **h_c = (int **)malloc(tableBytes);
    if (h_c == NULL) {
        fprintf(stderr, "malloc failed for the row-pointer table\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_c[i] = d_data + i * n;

    // Device copy of the row-pointer table; this is what the kernel receives.
    int **d_c = NULL;
    checkCuda(cudaMalloc((void **)&d_c, tableBytes), "cudaMalloc(row table)");
    checkCuda(cudaMemcpy(d_c, h_c, tableBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(row table)");

    dim3 block(n, n);
    fct<<<1, block>>>(d_c);
    // A launch does not return a status; query it explicitly.
    checkCuda(cudaGetLastError(), "fct launch");

    // Blocking copy of the whole matrix. Errors raised while the kernel was
    // executing are reported by this call.
    checkCuda(cudaMemcpy(hostData, d_data, dataBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(results)");

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }

    // Keep the console window open until the user presses Enter.
    getchar();

    // Release every allocation with the deallocator that matches its
    // allocator: cudaFree for cudaMalloc, free for malloc, delete[] for new[].
    checkCuda(cudaFree(d_c), "cudaFree(row table)");
    checkCuda(cudaFree(d_data), "cudaFree(matrix data)");
    free(h_c);
    delete[] cc;
    delete[] hostData;
    return 0;
}
c++ pointers cuda
add a comment |
I finally succeeded in storing a double pointer (an array of row pointers) in device memory so that I can use it in CUDA (the code is below), but I see that it performs worse than flattening the matrix would, which is not great.
Some suggestions to save some time/memory?
I really want to use dynamic 2d array.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>
__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}
int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);
dim3 d(2, 2);
fct << <1, d >> > (d_c);
for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}
c++ pointers cuda
If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.
– Robert Crovella
Dec 30 '18 at 16:31
Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.
– Vlad Constantinescu
Dec 30 '18 at 23:50
add a comment |
I finally succeeded in storing a double pointer (an array of row pointers) in device memory so that I can use it in CUDA (the code is below), but I see that it performs worse than flattening the matrix would, which is not great.
Some suggestions to save some time/memory?
I really want to use dynamic 2d array.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>
__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}
int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);
dim3 d(2, 2);
fct << <1, d >> > (d_c);
for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}
c++ pointers cuda
I finally succeeded in storing a double pointer (an array of row pointers) in device memory so that I can use it in CUDA (the code is below), but I see that it performs worse than flattening the matrix would, which is not great.
Some suggestions to save some time/memory?
I really want to use dynamic 2d array.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <cstdio>
__global__ void fct(int **dev_c)
{
int y = threadIdx.x;
int x = threadIdx.y;
dev_c[y][x] = 3;
}
int main(void)
{
//Output Array
int **cc = new int*[2];
for (int i = 0; i < 2; i++)cc[i] = new int[2];
//Host Array
int ** h_c = (int **)malloc(2 * sizeof(int *));
for (int i = 0; i < 2; i++) {
cudaMalloc((void**)&h_c[i], 2 * sizeof(int));
}
//Devie array
int ** d_c;
cudaMalloc((void **)&d_c, 2 * sizeof(int *));
cudaMemcpy(d_c, h_c, 2 * sizeof(int *), cudaMemcpyHostToDevice);
dim3 d(2, 2);
fct << <1, d >> > (d_c);
for (int i = 0; i < 2; i++) {
cudaMemcpy(cc[i], h_c[i], 2 * sizeof(int), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
printf("(%d,%d):%dn", i, j, cc[i][j]);
}
}
int x;
std::cin >> x;
delete h_c;
delete d_c;
}
c++ pointers cuda
c++ pointers cuda
asked Dec 30 '18 at 14:34
Vlad ConstantinescuVlad Constantinescu
344
344
If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.
– Robert Crovella
Dec 30 '18 at 16:31
Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.
– Vlad Constantinescu
Dec 30 '18 at 23:50
add a comment |
If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.
– Robert Crovella
Dec 30 '18 at 16:31
Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.
– Vlad Constantinescu
Dec 30 '18 at 23:50
If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.
– Robert Crovella
Dec 30 '18 at 16:31
If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.
– Robert Crovella
Dec 30 '18 at 16:31
Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.
– Vlad Constantinescu
Dec 30 '18 at 23:50
Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.
– Vlad Constantinescu
Dec 30 '18 at 23:50
add a comment |
1 Answer
1
active
oldest
votes
You may actually want to use flattened matrix with some pointer tricks:
// Demonstrates a 2D matrix with arr[x][y] access backed by a single
// contiguous allocation.
int main() {
    const int size = 10;

    // Table of row pointers; arr[i] will point at the start of row i.
    auto arr = new int*[size];

    // One contiguous block holding all size * size elements (uninitialized).
    arr[0] = new int[size * size];

    // Every further row starts `size` elements after the previous one.
    for (int i = 1; i < size; i++) {
        arr[i] = arr[0] + (i * size);
    }

    // Only two allocations were made, so only two releases are needed:
    // the element block (owned through arr[0]) and the pointer table.
    delete[] arr[0];
    delete[] arr;
}
This way, you can still access the matrix with `arr[x][y]` syntax, but the actual memory is contiguous (which is not only faster to allocate*, but also faster to access, because the cache prefetches the memory around the address you use).
*It is faster to allocate `size * size` elements once than to allocate `size` elements `size` times.
Side note: using `delete` on `malloc`ed memory is undefined behaviour. Don't mix `new`/`new[]` + `delete`/`delete[]` with `malloc` + `free`.
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
@VladConstantinescu What do you mean by `simple vector`?
– Fureeish
Dec 30 '18 at 15:34
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
You can pass this matrix as a 1d array simply by passing `*arr` (or `arr[0]`), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or do you want to pass single rows as 1d arrays?
– Fureeish
Dec 30 '18 at 15:49
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
|
show 2 more comments
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53978493%2fc-cuda-performance-for-double-pointers%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
You may actually want to use flattened matrix with some pointer tricks:
int main() {
const int size = 10;
auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}
This way, you can still access the matrix with arr[x][y]
syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).
*It is faster to allocate size * size
memory once, rather than allocating size
times size
elements.
Side note: using delete
on a malloc
ed memory is undefined behaviour. Don't mix new
/new
+ delete
/delete
with malloc
+ free
.
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
@VladConstantinescu What do you mean bysimple vector
?
– Fureeish
Dec 30 '18 at 15:34
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
You can pass this matrix as a 1d array simply by passing*arr
(orarr[0]
), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?
– Fureeish
Dec 30 '18 at 15:49
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
|
show 2 more comments
You may actually want to use flattened matrix with some pointer tricks:
int main() {
const int size = 10;
auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}
This way, you can still access the matrix with arr[x][y]
syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).
*It is faster to allocate size * size
memory once, rather than allocating size
times size
elements.
Side note: using delete
on a malloc
ed memory is undefined behaviour. Don't mix new
/new
+ delete
/delete
with malloc
+ free
.
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
@VladConstantinescu What do you mean bysimple vector
?
– Fureeish
Dec 30 '18 at 15:34
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
You can pass this matrix as a 1d array simply by passing*arr
(orarr[0]
), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?
– Fureeish
Dec 30 '18 at 15:49
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
|
show 2 more comments
You may actually want to use flattened matrix with some pointer tricks:
int main() {
const int size = 10;
auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}
This way, you can still access the matrix with arr[x][y]
syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).
*It is faster to allocate size * size
memory once, rather than allocating size
times size
elements.
Side note: using delete
on a malloc
ed memory is undefined behaviour. Don't mix new
/new
+ delete
/delete
with malloc
+ free
.
You may actually want to use flattened matrix with some pointer tricks:
int main() {
const int size = 10;
auto arr = new int*[size];
arr[0] = new int[size * size];
for(int i = 1; i < size; i++) {
arr[i] = arr[0] + (i * size);
}
}
This way, you can still access the matrix with arr[x][y]
syntax, but the actual memory is contiguous (which is not only faster to allocate*, but faster to access, given cache pre-fetching memory around the one you desire to use).
*It is faster to allocate size * size
memory once, rather than allocating size
times size
elements.
Side note: using delete
on a malloc
ed memory is undefined behaviour. Don't mix new
/new
+ delete
/delete
with malloc
+ free
.
edited Dec 30 '18 at 15:22
answered Dec 30 '18 at 14:41
FureeishFureeish
3,27321029
3,27321029
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
@VladConstantinescu What do you mean bysimple vector
?
– Fureeish
Dec 30 '18 at 15:34
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
You can pass this matrix as a 1d array simply by passing*arr
(orarr[0]
), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?
– Fureeish
Dec 30 '18 at 15:49
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
|
show 2 more comments
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
@VladConstantinescu What do you mean bysimple vector
?
– Fureeish
Dec 30 '18 at 15:34
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
You can pass this matrix as a 1d array simply by passing*arr
(orarr[0]
), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?
– Fureeish
Dec 30 '18 at 15:49
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
Thank you! Still, if i wanna save that arr in cuda memory i can do it like a simple vector?Or i have to alloc every row?
– Vlad Constantinescu
Dec 30 '18 at 15:33
@VladConstantinescu What do you mean by
simple vector
?– Fureeish
Dec 30 '18 at 15:34
@VladConstantinescu What do you mean by
simple vector
?– Fureeish
Dec 30 '18 at 15:34
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
I mean , 1d array ( int *a = new int[size])
– Vlad Constantinescu
Dec 30 '18 at 15:37
You can pass this matrix as a 1d array simply by passing
*arr
(or arr[0]
), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?– Fureeish
Dec 30 '18 at 15:49
You can pass this matrix as a 1d array simply by passing
*arr
(or arr[0]
), if that's what you are asking. It's still a little unclear. Do you want to pass the whole matrix as a single 1d array? Or you want to pass single rows as 1d arrays?– Fureeish
Dec 30 '18 at 15:49
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
i want to pass to the kernel function(fct) the whole 2d matrix. There is a way that i will still be able to use the syntax matrix[i][j] inside the function ?
– Vlad Constantinescu
Dec 30 '18 at 15:54
|
show 2 more comments
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53978493%2fc-cuda-performance-for-double-pointers%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
If you know the width of your 2D matrix at compile time, it's possible to use doubly-subscripted access (even in device code) while still maintaining the performance benefits of indexed vs. double-pointer access. This answer discusses various methods including the known-width approach. If you do not know the width of your 2D matrix at compile time, I'm not aware of any method to do doubly-subscripted access without 2 pointer dereferences per access.
– Robert Crovella
Dec 30 '18 at 16:31
Thank you! I will keep in mind this but sadly for this project i need that array to be dynamic.
– Vlad Constantinescu
Dec 30 '18 at 23:50