Python源码示例:cupy.cuda.function.Module()
示例1
def load_sru_mod():
    """Compile the SRU CUDA kernels once and publish them via module globals.

    Populates SRU_FWD_FUNC / SRU_BWD_FUNC / SRU_BiFWD_FUNC / SRU_BiBWD_FUNC
    with cupy kernel handles and SRU_STREAM with the current CUDA stream.
    No-op when the SRU requirements are not met.
    """
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if not check_sru_requirement():
        return
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # Touch a tensor on the GPU so the CUDA context is initialized
    # before NVRTC compilation and module loading.
    cuda_device = torch.device("cuda")
    _warmup = torch.rand(1, 1).to(cuda_device)

    # Compile the embedded CUDA source (SRU_CODE) to PTX and load it.
    program = Program(SRU_CODE.encode('utf-8'),
                      'sru_prog.cu'.encode('utf-8'))
    ptx = program.compile()
    module = function.Module()
    module.load(bytes(ptx.encode()))

    SRU_FWD_FUNC = module.get_function('sru_fwd')
    SRU_BWD_FUNC = module.get_function('sru_bwd')
    SRU_BiFWD_FUNC = module.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = module.get_function('sru_bi_bwd')

    Stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
示例2
def compile(self):
    """Lazily compile the ReLU CUDA kernels and cache them per GPU.

    Compiles the module-level `kernel` source to PTX once (stored on the
    class), then loads it and resolves the forward/backward kernels the
    first time each GPU is seen. Results are cached in
    GPUReLUF.configured_gpus keyed by device index.
    """
    if self.ptx is None:
        # One-time NVRTC compilation; the PTX is shared by all instances.
        program = Program(kernel, 'relu.cu')
        GPUReLUF.ptx = program.compile()
    gpu = torch.cuda.current_device()
    if gpu not in GPUReLUF.configured_gpus:
        # First use on this GPU: load PTX and resolve kernel entry points.
        module = function.Module()
        module.load(bytes(self.ptx))
        self.relu_forward = module.get_function('relu_forward')
        self.relu_backward = module.get_function('relu_backward')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUReLUF.configured_gpus[gpu] = (
            self.relu_forward, self.relu_backward, self.stream)
    self.relu_forward, self.relu_backward, self.stream = GPUReLUF.configured_gpus[gpu]
示例3
def load_sru_mod():
    """Compile the SRU CUDA kernels once and publish them via module globals.

    Populates SRU_FWD_FUNC / SRU_BWD_FUNC / SRU_BiFWD_FUNC / SRU_BiBWD_FUNC
    with cupy kernel handles and SRU_STREAM with the current CUDA stream.
    No-op when the SRU requirements are not met.
    """
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if not check_sru_requirement():
        return
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # Touch a tensor on the GPU so the CUDA context is initialized
    # before NVRTC compilation and module loading.
    cuda_device = torch.device("cuda")
    _warmup = torch.rand(1, 1).to(cuda_device)

    # Compile the embedded CUDA source (SRU_CODE) to PTX and load it.
    program = Program(SRU_CODE.encode('utf-8'),
                      'sru_prog.cu'.encode('utf-8'))
    ptx = program.compile()
    module = function.Module()
    module.load(bytes(ptx.encode()))

    SRU_FWD_FUNC = module.get_function('sru_fwd')
    SRU_BWD_FUNC = module.get_function('sru_bwd')
    SRU_BiFWD_FUNC = module.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = module.get_function('sru_bi_bwd')

    Stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
示例4
def compile_functions(self):
    """Load the RRNN CUDA module for the current GPU and cache its kernels.

    Loads the pre-compiled PTX (self._RRNN_PTX), resolves the forward and
    backward kernels for the configured semiring, and stores
    (stream, fwd_func, bwd_func) in self._DEVICE2FUNC keyed by the current
    device index.

    Returns:
        tuple: (current_stream, fwd_func, bwd_func).
    """
    device = torch.cuda.current_device()
    print("RRNN loaded for gpu {}".format(device))
    mod = function.Module()
    mod.load(bytes(self._RRNN_PTX.encode()))
    # Only the kernel names differ between the plain recurrence
    # (semiring type 0) and the semiring variant; the caching and
    # return logic below is shared (previously duplicated per branch).
    if self.semiring.type == 0:
        fwd_func = mod.get_function("rrnn_fwd")
        bwd_func = mod.get_function("rrnn_bwd")
    else:
        fwd_func = mod.get_function("rrnn_semiring_fwd")
        bwd_func = mod.get_function("rrnn_semiring_bwd")
    Stream = namedtuple("Stream", ["ptr"])
    current_stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
    self._DEVICE2FUNC[device] = (current_stream, fwd_func, bwd_func)
    return current_stream, fwd_func, bwd_func
示例5
def load_sru_mod():
    """Compile the SRU CUDA kernels once and publish them via module globals.

    Populates SRU_FWD_FUNC / SRU_BWD_FUNC / SRU_BiFWD_FUNC / SRU_BiBWD_FUNC
    with cupy kernel handles and SRU_STREAM with the current CUDA stream.
    No-op when the SRU requirements are not met.
    """
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if not check_sru_requirement():
        return
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # Touch a tensor on the GPU so the CUDA context is initialized
    # before NVRTC compilation and module loading.
    cuda_device = torch.device("cuda")
    _warmup = torch.rand(1, 1).to(cuda_device)

    # Compile the embedded CUDA source (SRU_CODE) to PTX and load it.
    program = Program(SRU_CODE.encode('utf-8'),
                      'sru_prog.cu'.encode('utf-8'))
    ptx = program.compile()
    module = function.Module()
    module.load(bytes(ptx.encode()))

    SRU_FWD_FUNC = module.get_function('sru_fwd')
    SRU_BWD_FUNC = module.get_function('sru_bwd')
    SRU_BiFWD_FUNC = module.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = module.get_function('sru_bi_bwd')

    Stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
示例6
def compile(self):
    """Lazily compile the ForgetMult CUDA kernels and cache them per GPU.

    Compiles the module-level `kernel` source to PTX once (stored on the
    class), then loads it and resolves the forward/backward kernels the
    first time each GPU is seen. Results are cached in
    GPUForgetMult.configured_gpus keyed by device index.
    """
    if self.ptx is None:
        # One-time NVRTC compilation; the PTX is shared by all instances.
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        GPUForgetMult.ptx = program.compile()
    gpu = torch.cuda.current_device()
    if gpu not in GPUForgetMult.configured_gpus:
        # First use on this GPU: load PTX and resolve kernel entry points.
        module = function.Module()
        module.load(bytes(self.ptx.encode()))
        self.forget_mult = module.get_function('recurrent_forget_mult')
        self.bwd_forget_mult = module.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[gpu] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[gpu]
示例7
def compile(self):
    """Lazily compile the ForgetMult CUDA kernels and cache them per GPU.

    Compiles the module-level `kernel` source to PTX once (stored on the
    class), then loads it and resolves the forward/backward kernels the
    first time each GPU is seen. Results are cached in
    GPUForgetMult.configured_gpus keyed by device index.
    """
    if self.ptx is None:
        # One-time NVRTC compilation; the PTX is shared by all instances.
        program = Program(kernel.encode(),
                          'recurrent_forget_mult.cu'.encode())
        GPUForgetMult.ptx = program.compile()
    gpu = torch.cuda.current_device()
    if gpu not in GPUForgetMult.configured_gpus:
        # First use on this GPU: load PTX and resolve kernel entry points.
        module = function.Module()
        module.load(bytes(self.ptx.encode()))
        self.forget_mult = module.get_function('recurrent_forget_mult')
        self.bwd_forget_mult = module.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[gpu] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = (
        GPUForgetMult.configured_gpus[gpu])
示例8
def load_sru_mod():
    """Compile the SRU CUDA kernels once and publish them via module globals.

    Populates SRU_FWD_FUNC / SRU_BWD_FUNC / SRU_BiFWD_FUNC / SRU_BiBWD_FUNC
    with cupy kernel handles and SRU_STREAM with the current CUDA stream.
    No-op when the SRU requirements are not met.
    """
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if not check_sru_requirement():
        return
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # Touch a tensor on the GPU so the CUDA context is initialized
    # before NVRTC compilation and module loading.
    cuda_device = torch.device("cuda")
    _warmup = torch.rand(1, 1).to(cuda_device)

    # Compile the embedded CUDA source (SRU_CODE) to PTX and load it.
    program = Program(SRU_CODE.encode('utf-8'),
                      'sru_prog.cu'.encode('utf-8'))
    ptx = program.compile()
    module = function.Module()
    module.load(bytes(ptx.encode()))

    SRU_FWD_FUNC = module.get_function('sru_fwd')
    SRU_BWD_FUNC = module.get_function('sru_bwd')
    SRU_BiFWD_FUNC = module.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = module.get_function('sru_bi_bwd')

    Stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
示例9
def load_sru_mod():
    """Compile the SRU CUDA kernels once and publish them via module globals.

    Populates SRU_FWD_FUNC / SRU_BWD_FUNC / SRU_BiFWD_FUNC / SRU_BiBWD_FUNC
    with cupy kernel handles and SRU_STREAM with the current CUDA stream.
    No-op when the SRU requirements are not met.
    """
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if not check_sru_requirement():
        return
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # Touch a tensor on the GPU so the CUDA context is initialized
    # before NVRTC compilation and module loading.
    cuda_device = torch.device("cuda")
    _warmup = torch.rand(1, 1).to(cuda_device)

    # Compile the embedded CUDA source (SRU_CODE) to PTX and load it.
    program = Program(SRU_CODE.encode('utf-8'),
                      'sru_prog.cu'.encode('utf-8'))
    ptx = program.compile()
    module = function.Module()
    module.load(bytes(ptx.encode()))

    SRU_FWD_FUNC = module.get_function('sru_fwd')
    SRU_BWD_FUNC = module.get_function('sru_bwd')
    SRU_BiFWD_FUNC = module.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = module.get_function('sru_bi_bwd')

    Stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
示例10
def compile(self):
    """Lazily compile the ForgetMult CUDA kernels and cache them per GPU.

    Compiles the module-level `kernel` source to PTX once (stored on the
    class), then loads it and resolves the forward/backward kernels the
    first time each GPU is seen. Results are cached in
    GPUForgetMult.configured_gpus keyed by device index.
    """
    if self.ptx is None:
        # One-time NVRTC compilation; the PTX is shared by all instances.
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        GPUForgetMult.ptx = program.compile()
    gpu = torch.cuda.current_device()
    if gpu not in GPUForgetMult.configured_gpus:
        # First use on this GPU: load PTX and resolve kernel entry points.
        module = function.Module()
        module.load(bytes(self.ptx.encode()))
        self.forget_mult = module.get_function('recurrent_forget_mult')
        self.bwd_forget_mult = module.get_function('bwd_recurrent_forget_mult')
        Stream = namedtuple('Stream', ['ptr'])
        self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        GPUForgetMult.configured_gpus[gpu] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)
    self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[gpu]
示例11
def compile(self):
    """Compile self.kernel with NVRTC and bind self.func to self.func_name.

    Targets the compute architecture reported by
    cupyKernel.get_compute_arch() and marks the instance compiled.
    """
    nvrtc_program = Program(self.kernel, self.title)
    # Build the -arch flag for the current device's compute capability.
    arch_flag = "-arch={0}".format(cupyKernel.get_compute_arch())
    ptx_code = nvrtc_program.compile([arch_flag])
    # Load the PTX into a cupy module and resolve the entry point.
    module = function.Module()
    module.load(bytes(ptx_code.encode()))
    self.func = module.get_function(self.func_name)
    self.compiled = True
示例12
def __call__(self, input):
    """Return the complex modulus of `input`: |z| in the real slot, 0 in the
    imaginary slot of each (real, imag) pair along the last dimension.

    Uses a pure-PyTorch path when JIT is disabled or `input` is not a CUDA
    float tensor; otherwise launches a per-device cached CUDA kernel.
    """
    if not self.jit or not isinstance(input, torch.cuda.FloatTensor):
        # Fallback path: L2 norm over the last dimension, then append a
        # zero tensor as the imaginary part.
        norm = input.norm(2, input.dim() - 1)
        return torch.cat([norm, norm.new(norm.size()).zero_()], input.dim() - 1)
    out = input.new(input.size())
    # The kernel indexes flat memory, so the input must be contiguous.
    input = input.contiguous()
    if not iscomplex(input):
        raise TypeError('The input and outputs should be complex')
    # Compile and load the kernel once per device; presumably
    # modulus_cache maps device -> module and defaults to None — TODO confirm.
    if (self.modulus_cache[input.get_device()] is None):
        kernel = b"""
extern "C"
__global__ void abs_complex_value(const float * x, float2 * z, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n)
return;
z[i] = make_float2(normf(2, x + 2*i), 0);
}
"""
        print('modulus.cu')
        prog = Program(kernel, b'modulus.cu')
        ptx = prog.compile(['-arch='+get_compute_arch(input)])
        module = Module()
        module.load(bytes(ptx.encode()))
        self.modulus_cache[input.get_device()] = module
    fabs = self.modulus_cache[input.get_device()].get_function('abs_complex_value')
    # One thread per complex element: out holds 2 floats per element,
    # hence nelement()//2 work items, launched on the current torch stream.
    fabs(grid=(self.GET_BLOCKS(int(out.nelement())//2), 1, 1),
         block=(self.CUDA_NUM_THREADS, 1, 1),
         args=[input.data_ptr(), out.data_ptr(), out.numel() // 2],
         stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return out
示例13
def smooth_local_affine(output_cpu, input_cpu, epsilon, patch, h, w, f_r, f_e):
    """Refine `output_cpu` with locally affine models guided by `input_cpu`.

    Fits a local affine transform from `input_cpu` to `output_cpu` around
    each pixel, bilaterally smooths the affine coefficients, then
    reconstructs the output image from the smoothed models. All three
    stages run as CUDA kernels compiled from the module-level `src`.

    Args:
        output_cpu: numpy array, image to refine.
        input_cpu: numpy array, guidance image.
        epsilon: regularization of the affine fit.
        patch: (odd) patch size used to fit each local affine model.
        h, w: image height and width in pixels.
        f_r: bilateral filter radius.
        f_e: range sigma of the bilateral filter.

    Returns:
        numpy array with the reconstructed, smoothed result.
    """
    program = Program(src, 'best_local_affine_kernel.cu')
    ptx = program.compile(['-I/usr/local/cuda/include'])
    m = function.Module()
    m.load(bytes(ptx.encode()))
    _reconstruction_best_kernel = m.get_function('reconstruction_best_kernel')
    _bilateral_smooth_kernel = m.get_function('bilateral_smooth_kernel')
    _best_local_affine_kernel = m.get_function('best_local_affine_kernel')

    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    filter_radius = f_r
    sigma1 = filter_radius / 3
    sigma2 = f_e
    radius = (patch - 1) / 2

    # Device buffers: 12 affine coefficients per pixel — presumably a
    # 3x4 color affine model; confirm against the kernel source.
    filtered_best_output = torch.zeros(np.shape(input_cpu)).cuda()
    affine_model = torch.zeros((h * w, 12)).cuda()
    filtered_affine_model = torch.zeros((h * w, 12)).cuda()

    input_ = torch.from_numpy(input_cpu).cuda()
    output_ = torch.from_numpy(output_cpu).cuda()

    # All launches: one thread per pixel, 256 threads per block.
    _best_local_affine_kernel(
        grid=(int((h * w) / 256 + 1), 1),
        block=(256, 1, 1),
        args=[output_.data_ptr(), input_.data_ptr(), affine_model.data_ptr(),
              np.int32(h), np.int32(w), np.float32(epsilon), np.int32(radius)], stream=s
    )
    _bilateral_smooth_kernel(
        grid=(int((h * w) / 256 + 1), 1),
        block=(256, 1, 1),
        args=[affine_model.data_ptr(), filtered_affine_model.data_ptr(),
              input_.data_ptr(), np.int32(h), np.int32(w), np.int32(f_r),
              np.float32(sigma1), np.float32(sigma2)], stream=s
    )
    _reconstruction_best_kernel(
        grid=(int((h * w) / 256 + 1), 1),
        block=(256, 1, 1),
        args=[input_.data_ptr(), filtered_affine_model.data_ptr(),
              filtered_best_output.data_ptr(),
              np.int32(h), np.int32(w)], stream=s
    )
    return filtered_best_output.cpu().numpy()
示例14
def __call__(self, input, k):
    """Periodize (downsample by factor `k`) a complex tensor by averaging
    k x k cells.

    Assumes `input` has layout (B, C, H', W', 2) with the trailing dimension
    holding (real, imag) pairs — inferred from the iscomplex() check and the
    trailing size-2 dimension; TODO confirm against callers.
    Returns a tensor of shape (B, C, H'//k, W'//k, 2).
    """
    out = input.new(input.size(0), input.size(1), input.size(2) // k, input.size(3) // k, 2)
    if not self.jit or isinstance(input, (torch.FloatTensor, torch.DoubleTensor)):
        # CPU / non-JIT fallback: expose the k x k cells as extra view
        # dimensions, then average them away.
        y = input.view(input.size(0), input.size(1),
                       input.size(2)//out.size(2), out.size(2),
                       input.size(3)//out.size(3), out.size(3),
                       2)
        out = y.mean(4).squeeze(4).mean(2).squeeze(2)
        return out
    if not iscomplex(input):
        raise (TypeError('The input and outputs should be complex'))
    # The kernel indexes flat memory, so the input must be contiguous.
    input = input.contiguous()
    # One compiled module per (input size, output size, device) triple;
    # presumably periodize_cache defaults missing keys to None — TODO confirm.
    if (self.periodize_cache[(input.size(), out.size(), input.get_device())] is None):
        # CUDA source template; ${...} placeholders are filled below by
        # string.Template with the concrete sizes and dtype.
        kernel = '''
#define NW ${W} / ${k}
#define NH ${H} / ${k}
extern "C"
__global__ void periodize(const ${Dtype}2 *input, ${Dtype}2 *output)
{
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int ty = blockIdx.y * blockDim.y + threadIdx.y;
int tz = blockIdx.z * blockDim.z + threadIdx.z;
if(tx >= NW || ty >= NH || tz >= ${B})
return;
input += tz * ${H} * ${W} + ty * ${W} + tx;
${Dtype}2 res = make_${Dtype}2(0.f, 0.f);
for (int j=0; j<${k}; ++j)
for (int i=0; i<${k}; ++i)
{
const ${Dtype}2 &c = input[j * NH * ${W} + i * NW];
res.x += c.x;
res.y += c.y;
}
res.x /= ${k} * ${k};
res.y /= ${k} * ${k};
output[tz * NH * NW + ty * NW + tx] = res;
}
'''
        # B = number of 2D complex planes (all leading dims folded together).
        B = input.nelement() // (2*input.size(-2) * input.size(-3))
        W = input.size(-2)
        H = input.size(-3)
        k = input.size(-2) // out.size(-2)
        kernel = Template(kernel).substitute(B=B, H=H, W=W, k=k, Dtype=getDtype(input))
        # Unique per-configuration source name so NVRTC caches don't collide.
        name = str(input.get_device())+'-'+str(B)+'-'+str(k)+'-'+str(H)+'-'+str(W)+'-periodize.cu'
        print(name)
        prog = Program(kernel, name.encode())
        ptx = prog.compile(['-arch='+get_compute_arch(input)])
        module = Module()
        module.load(bytes(ptx.encode()))
        self.periodize_cache[(input.size(), out.size(), input.get_device())] = module
    # Grid covers the three kernel axes (tx, ty, tz) of the output.
    grid = (self.GET_BLOCKS(out.size(-3), self.block[0]),
            self.GET_BLOCKS(out.size(-2), self.block[1]),
            self.GET_BLOCKS(out.nelement() // (2*out.size(-2) * out.size(-3)), self.block[2]))
    periodize = self.periodize_cache[(input.size(), out.size(), input.get_device())].get_function('periodize')
    periodize(grid=grid, block=self.block, args=[input.data_ptr(), out.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return out