所以这是我的程序的一部分,我为两个班级做了一个减少总和。我用共享数组
索引
我有一个测试用例的大小,用
我想调用一个块数等于我的测试数(10000)的内核,但是总和有一些问题,所以我改为逐步进行。
我找不到解决方案,但是每当我调用块数超过
此处的 Cuda 功能是 2.0,即 GT 520 卡。使用 CUDA 6.5 编译。
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\
", cudaGetErrorString(code), file, line);
}
}
#define MAX_CLASSES 2
#define max_threads 64
//#define MAX_FEATURES 65
__device__ __constant__ int d_MAX_SIZE;
__device__ __constant__ int offset;
__device__ void rules_points_reduction(float points[max_threads * MAX_CLASSES], int nrules[max_threads * MAX_CLASSES]){
float psum[MAX_CLASSES];
int nsum[MAX_CLASSES];
for (int i = 0; i < MAX_CLASSES; i++){
psum[i] = points[threadIdx.x + i * blockDim.x];
nsum[i] = nrules[threadIdx.x + i * blockDim.x];
}
__syncthreads();
if (blockDim.x >= 1024) {
if (threadIdx.x < 512) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 512 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 512 + i * blockDim.x];
}
} __syncthreads();
}
if (blockDim.x >= 512) {
if (threadIdx.x < 256) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 256 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 256 + i * blockDim.x];
}
} __syncthreads();
}
if (blockDim.x >= 256) {
if (threadIdx.x < 128) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 128 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 128 + i * blockDim.x];
}
} __syncthreads();
}
if (blockDim.x >= 128) {
if (threadIdx.x < 64) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 64 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 64 + i * blockDim.x];
}
} __syncthreads();
}
if (threadIdx.x < 32)
{
// now that we are using warp-synchronous programming (below)
// we need to declare our shared memory volatile so that the compiler
// doesn't reorder stores to it and induce incorrect behavior.
//volatile int* smem = nrules;
//volatile float* smemf = points;
if (blockDim.x >= 64) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 32 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 32 + i * blockDim.x];
}
}
if (blockDim.x >= 32) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 16 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 16 + i * blockDim.x];
}
}
if (blockDim.x >= 16) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 8 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 8 + i * blockDim.x];
}
}
if (blockDim.x >= 8) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 4 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 4 + i * blockDim.x];
}
}
if (blockDim.x >= 4) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 2 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 2 + i * blockDim.x];
}
}
if (blockDim.x >= 2) {
for (int i = 0; i < MAX_CLASSES; i++){
points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 1 + i * blockDim.x];
nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 1 + i * blockDim.x];
}
}
}
}
__device__ void d_get_THE_prediction(short k, float* finalpoints, int* gn_rules)
{
int max;
short true_label, n_items;
__shared__ float points[max_threads * MAX_CLASSES];
__shared__ int nrules[max_threads * MAX_CLASSES];
//__shared__ short items[MAX_FEATURES], ele[MAX_FEATURES];
__shared__ int max2;
for (int i = 0; i < MAX_CLASSES; i++)
{
points[threadIdx.x + i * blockDim.x] = 1;
nrules[threadIdx.x + i * blockDim.x] = 1;
}
if (threadIdx.x == 0) {
if (k == 1){
nrules[0] = 1;
nrules[blockDim.x] = 1;
}
//max2 = GetBinCoeff_l_d(n_items, k);
}
__syncthreads();
//max = max2;
//d_induce_rules(items, ele, n_items, k, max, nrules, points);
__syncthreads();
rules_points_reduction(points, nrules);
if (threadIdx.x == 0){
for (int i = 0; i < MAX_CLASSES; i++){
gn_rules[(blockIdx.x + offset) + i * blockDim.x] += nrules[i * blockDim.x];
finalpoints[(blockIdx.x + offset) + i * blockDim.x] += points[i * blockDim.x];
}
printf("block %d k%d %f %f %d %d\
", (blockIdx.x + offset), k, finalpoints[(blockIdx.x + offset)],
finalpoints[(blockIdx.x + offset) + blockDim.x], gn_rules[(blockIdx.x + offset)], gn_rules[(blockIdx.x + offset) + blockDim.x]);
}
}
__global__ void lazy_supervised_classification_kernel(int k, float* finalpoints, int* n_rules){
d_get_THE_prediction( k, finalpoints, n_rules);
}
int main() {
//freopen("output.txt","w", stdout);
int N_TESTS = 10000;
int MAX_SIZE = 3;
float *finalpoints = (float*)calloc(MAX_CLASSES * N_TESTS, sizeof(float));
float *d_finalpoints = 0;
int *d_nruls = 0;
int *nruls = (int*)calloc(MAX_CLASSES * N_TESTS, sizeof(int));
gpuErrchk(cudaMalloc(&d_finalpoints, MAX_CLASSES * N_TESTS * sizeof(float)));
gpuErrchk(cudaMemset(d_finalpoints, 0, MAX_CLASSES * N_TESTS * sizeof(float)));
gpuErrchk(cudaMalloc(&d_nruls, MAX_CLASSES * N_TESTS * sizeof(int)));
gpuErrchk(cudaMemset(d_nruls, 0, MAX_CLASSES * N_TESTS * sizeof(int)));
gpuErrchk(cudaMemcpyToSymbol(d_MAX_SIZE, &MAX_SIZE, sizeof(int), 0, cudaMemcpyHostToDevice));
int step = max_threads, ofset = 0;
for (int k = 1; k < MAX_SIZE; k++){
//N_TESTS-step
for (ofset = 0; ofset < max_threads; ofset += step){
gpuErrchk(cudaMemcpyToSymbol(offset, &ofset, sizeof(int), 0, cudaMemcpyHostToDevice));
lazy_supervised_classification_kernel <<<step, max_threads >>>(k, d_finalpoints, d_nruls);
gpuErrchk(cudaDeviceSynchronize());
}
gpuErrchk(cudaMemcpyToSymbol(offset, &ofset, sizeof(int), 0, cudaMemcpyHostToDevice));//comment these lines
//N_TESTS - step
lazy_supervised_classification_kernel <<<3, max_threads >> >(k, d_finalpoints, d_nruls);//
gpuErrchk(cudaDeviceSynchronize());//
}
gpuErrchk(cudaFree(d_finalpoints));
gpuErrchk(cudaFree(d_nruls));
free(finalpoints);
free(nruls);
gpuErrchk(cudaDeviceReset());
return(0);
}我不相信这个索引是你想要的:
gn_rules[(blockIdx.x + offset) + i * blockDim.x] += ...;
finalpoints[(blockIdx.x + offset) + i * blockDim.x] += ...;对于
所以如果你把上面的代码行改成:
gn_rules[(blockIdx.x + (offset*MAX_CLASSES)) + i * blockDim.x] += nrules[i * blockDim.x];
finalpoints[(blockIdx.x + (offset*MAX_CLASSES)) + i * blockDim.x] += points[i * blockDim.x];我相信你会得到你期望的输出。
作为我的Rails应用程序的一部分,我编写了一个小导入程序,它从我们的LDAP系统中吸取数据并将其塞入一个用户表中。不幸的是,与LDAP相关的代码在遍历我们的32K用户时泄漏了大量内存,我一直无法弄清楚如何解决这个问题。这个问题似乎在某种程度上与LDAP库有关,因为当我删除对LDAP内容的调用时,内存使用情况会很好地稳定下来。此外,不断增加的对象是Net::BER::BerIdentifiedString和Net::BER::BerIdentifiedArray,它们都是LDAP库的一部分。当我运行导入时,内存使用量最终达到超过1GB的峰值。如果问题存在,我需要找到一些方法来更正我的代
我正在寻找执行以下操作的正确语法(在Perl、Shell或Ruby中):#variabletoaccessthedatalinesappendedasafileEND_OF_SCRIPT_MARKERrawdatastartshereanditcontinues. 最佳答案 Perl用__DATA__做这个:#!/usr/bin/perlusestrict;usewarnings;while(){print;}__DATA__Texttoprintgoeshere 关于ruby-如何将脚
在我的gem中,我需要yaml并且在我的本地计算机上运行良好。但是在将我的gem推送到rubygems.org之后,当我尝试使用我的gem时,我收到一条错误消息=>"uninitializedconstantPsych::Syck(NameError)"谁能帮我解决这个问题?附言RubyVersion=>ruby1.9.2,GemVersion=>1.6.2,Bundlerversion=>1.0.15 最佳答案 经过几个小时的研究,我发现=>“YAML使用未维护的Syck库,而Psych使用现代的LibYAML”因此,为了解决
好的,所以我的目标是轻松地将一些数据保存到磁盘以备后用。您如何简单地写入然后读取一个对象?所以如果我有一个简单的类classCattr_accessor:a,:bdefinitialize(a,b)@a,@b=a,bendend所以如果我从中非常快地制作一个objobj=C.new("foo","bar")#justgaveitsomerandomvalues然后我可以把它变成一个kindaidstring=obj.to_s#whichreturns""我终于可以将此字符串打印到文件或其他内容中。我的问题是,我该如何再次将这个id变回一个对象?我知道我可以自己挑选信息并制作一个接受该信
我在Rails工作并有以下类(class):classPlayer当我运行时bundleexecrailsconsole然后尝试:a=Player.new("me",5.0,"UCLA")我回来了:=>#我不知道为什么Player对象不会在这里初始化。关于可能导致此问题的操作/解释的任何建议?谢谢,马里奥格 最佳答案 havenoideawhythePlayerobjectwouldn'tbeinitializedhere它没有初始化很简单,因为你还没有初始化它!您已经覆盖了ActiveRecord::Base初始化方法,但您没有调
ruby如何管理内存。例如:如果我们在执行过程中采用C程序,则以下是内存模型。类似于这个ruby如何处理内存。C:__________________|||stack|||------------------||||------------------|||||Heap|||||__________________|||data|__________________|text|__________________Ruby:? 最佳答案 Ruby中没有“内存”这样的东西。Class#allocate分配一个对象并返回该对象。这就是程序
我有用于控制用户任务的Rails5API项目,我有以下错误,但并非总是针对相同的Controller和路由。ActionController::RoutingError:uninitializedconstantApi::V1::ApiController我向您描述了一些我的项目,以更详细地解释错误。应用结构路线scopemodule:'api'donamespace:v1do#=>Loginroutesscopemodule:'login'domatch'login',to:'sessions#login',as:'login',via::postend#=>Teamroutessc
我正在阅读一本关于Ruby的书,作者在编写类初始化定义时使用的形式与他在本书前几节中使用的形式略有不同。它看起来像这样:classTicketattr_accessor:venue,:datedefinitialize(venue,date)self.venue=venueself.date=dateendend在本书的前几节中,它的定义如下:classTicketattr_accessor:venue,:datedefinitialize(venue,date)@venue=venue@date=dateendend在第一个示例中使用setter方法与在第二个示例中使用实例变量之间是
无论您是想搭建桌面端、WEB端或者移动端APP应用,HOOPSPlatform组件都可以为您提供弹性的3D集成架构,同时,由工业领域3D技术专家组成的HOOPS技术团队也能为您提供技术支持服务。如果您的客户期望有一种在多个平台(桌面/WEB/APP,而且某些客户端是“瘦”客户端)快速、方便地将数据接入到3D应用系统的解决方案,并且当访问数据时,在各个平台上的性能和用户体验保持一致,HOOPSPlatform将帮助您完成。利用HOOPSPlatform,您可以开发在任何环境下的3D基础应用架构。HOOPSPlatform可以帮您打造3D创新型产品,HOOPSSDK包含的技术有:快速且准确的CAD
我想解析一个已经存在的.mid文件,改变它的乐器,例如从“acousticgrandpiano”到“violin”,然后将它保存回去或作为另一个.mid文件。根据我在文档中看到的内容,该乐器通过program_change或patch_change指令进行了更改,但我找不到任何在已经存在的MIDI文件中执行此操作的库.他们似乎都只支持从头开始创建的MIDI文件。 最佳答案 MIDIpackage会为您完成此操作,但具体方法取决于midi文件的原始内容。一个MIDI文件由一个或多个音轨组成,每个音轨是十六个channel中任何一个上的