Merge pull request #2 from li-xl/revert-1-JSparse/jkay
Revert "Change the architecture of the code and add setup.py"
commit 836a2f55a5
@ -1,2 +0,0 @@
*.ipynb
__pycache__
@ -1,11 +1,3 @@
# JSparse

JSparse is a high-performance auto-differentiation library for sparse tensors.

## How to Install

```
cd python
python setup.py install
```
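For a quick check after installing, the snippet below is a minimal usage sketch distilled from the example script that this revert removes; the `SparseTensor(coords, feats, 1, 1)` call (stride and size passed positionally), the `JSparse.nn` layer names, and the random point cloud are taken from or modeled on that script and are illustrative only.

```
import numpy as np
import jittor as jt

from JSparse import SparseTensor
from JSparse import nn as spnn

# A toy point cloud: 1000 points with 4 feature channels each.
coords = np.random.uniform(0, 100, size=(1000, 4))
coords -= coords.min(axis=0, keepdims=True)
feats = coords.copy()

# Wrap as a sparse tensor; the trailing 1, 1 mirrors the positional
# stride and size arguments used in the example script below.
inputs = SparseTensor(jt.Var(coords), jt.Var(feats), 1, 1)

# A single sparse 3D convolution layer from JSparse.nn.
conv = spnn.Conv3d(4, 32, kernel_size=3)
outputs = conv(inputs)
print(outputs.values.shape)
```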
@ -1,73 +0,0 @@
|
|||
|
||||
import jittor
|
||||
from jittor import nn
|
||||
|
||||
import JSparse
|
||||
from JSparse import SparseTensor
|
||||
from JSparse import nn as spnn
|
||||
from JSparse.utils.quantize import sparse_quantize
|
||||
|
||||
import numpy as np
|
||||
|
||||
class RandomDataset(jittor.dataset.Dataset):
|
||||
|
||||
def __init__(self, input_size: int, voxel_size: float) -> None:
|
||||
super().__init__()
|
||||
self.set_attrs(total_len = input_size)
|
||||
self.voxel_size = voxel_size
|
||||
|
||||
def __getitem__(self, _: int):
|
||||
inputs = np.random.uniform(-100, 100, size=(self.total_len, 4))
|
||||
labels = np.random.choice(10, size=self.total_len)
|
||||
|
||||
coords, feats = inputs[:, :], inputs
|
||||
coords -= np.min(coords, axis=0, keepdims=True)
|
||||
# coords, indices = sparse_quantize(coords, self.voxel_size, return_index=True)
|
||||
|
||||
coords = jittor.Var(coords)
|
||||
feats = jittor.Var(feats)
|
||||
labels = jittor.Var(labels)
|
||||
# coords = jittor.Var(coords, dtype=jittor.int64)
|
||||
# feats = jittor.Var(feats[indices], dtype=jittor.float64)
|
||||
# labels = jittor.Var(labels[indices], dtype=jittor.int64)
|
||||
|
||||
print(type(coords))
|
||||
inputs = SparseTensor(coords, feats, 1, 1)
|
||||
labels = SparseTensor(coords, labels, 1, 1)
|
||||
return inputs, labels
|
||||
|
||||
if __name__ == '__main__':
|
||||
np.random.seed(0)
|
||||
|
||||
dataset = RandomDataset(input_size=10000, voxel_size=0.2)
|
||||
|
||||
model = nn.Sequential(
|
||||
spnn.Conv3d(4, 32, 3),
|
||||
spnn.BatchNorm(32),
|
||||
spnn.ReLU(True),
|
||||
spnn.Conv3d(32, 64, 2, stride=2),
|
||||
spnn.BatchNorm(64),
|
||||
spnn.ReLU(True),
|
||||
spnn.Conv3d(64, 64, 2, stride=2, transposed=True),
|
||||
spnn.BatchNorm(64),
|
||||
spnn.ReLU(True),
|
||||
spnn.Conv3d(64, 32, 3),
|
||||
spnn.BatchNorm(32),
|
||||
spnn.ReLU(True),
|
||||
spnn.Conv3d(32, 10, 1),
|
||||
)
|
||||
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
optimizer = jittor.optim.Adam(model.parameters(), lr=1e-3)
|
||||
|
||||
model.train()
|
||||
lens = len(dataset)
|
||||
for batch_idx, (inputs, labels) in enumerate(dataset):
|
||||
outputs = model(inputs)
|
||||
loss = criterion(outputs, labels)
|
||||
optimizer.step(loss)
|
||||
|
||||
if batch_idx % 10 == 0:
|
||||
print('Training: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
|
||||
batch_idx, lens, 100. * batch_idx / lens, loss.numpy()[0]))
|
|
@ -1 +0,0 @@
|
|||
from .sparse import *
|
|
@ -1,34 +0,0 @@
|
|||
import os
|
||||
import numpy as np
|
||||
from typing import Union, List, Tuple
|
||||
|
||||
import jittor as jt
|
||||
from jittor import Function
|
||||
|
||||
class IndiceManager:
|
||||
def __init__(
|
||||
self,
|
||||
ndim,
|
||||
# indice_map_type,
|
||||
# sparse_algorithm, # set m_hashtable_occupancy for concurrent_unordered_map
|
||||
):
|
||||
# if indice_map_type == 'GPU':
|
||||
# assert(jt.has_cuda)
|
||||
|
||||
self.ndim = ndim
|
||||
# self.indice_map_type = indice_map_type
|
||||
# self.sparse_algorithm = sparse_algorithm
|
||||
self.stride_key_manager = {}
|
||||
self.indice_map_manager = {}
|
||||
self.kernel_map_manager = {}
|
||||
|
||||
def insert(self, stride, indice_key, indice_hash):
|
||||
self.stride_key_manager[stride] = indice_key
|
||||
self.indice_map_manager[indice_key] = indice_hash
|
||||
|
||||
|
||||
# class IndiceMapManager
|
||||
|
||||
|
||||
|
||||
|
|
@ -1 +0,0 @@
|
|||
from .modules import *
|
|
@ -1,10 +0,0 @@
|
|||
from .activation import *
|
||||
from .conv import *
|
||||
from .count import *
|
||||
# from .crop import *
|
||||
from .devoxelize import *
|
||||
from .downsample import *
|
||||
from .hash import *
|
||||
from .pooling import *
|
||||
from .query import *
|
||||
from .voxelize import *
|
|
@ -1,24 +0,0 @@
|
|||
import jittor as jt
|
||||
import jittor.nn as nn
|
||||
|
||||
from JSparse import SparseTensor
|
||||
from JSparse.nn.utils import fapply
|
||||
|
||||
__all__ = ['relu', 'leaky_relu']
|
||||
# __all__ = ['relu', 'leaky_relu', 'ReLU', 'LeakyReLU']
|
||||
|
||||
def relu(input: SparseTensor) -> SparseTensor:
|
||||
return fapply(input, nn.relu)
|
||||
|
||||
|
||||
def leaky_relu(input: SparseTensor,
|
||||
scale: float = 0.01) -> SparseTensor:
|
||||
return fapply(input,
|
||||
nn.leaky_relu,
|
||||
scale=scale)
|
||||
|
||||
# Relu = jt.make_module(relu)
|
||||
# ReLU = Relu
|
||||
# Leaky_relu = jt.make_module(leaky_relu, 2)
|
||||
# LeakyReLU = Leaky_relu
|
||||
|
|
@ -1,351 +0,0 @@
|
|||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import jittor as jt
|
||||
from jittor import Function
|
||||
from jittor.misc import _pair, _triple
|
||||
|
||||
from JSparse import SparseTensor
|
||||
from JSparse.nn import functional as F
|
||||
from JSparse.nn.utils import get_kernel_offsets
|
||||
from JSparse import make_ntuple
|
||||
|
||||
__all__ = ['conv3d', 'Convolution']
|
||||
|
||||
class Convolution(Function):
|
||||
def execute(
|
||||
self,
|
||||
input: jt.Var,
|
||||
weight: jt.Var,
|
||||
nbmaps: jt.Var,
|
||||
nbsizes: jt.Var,
|
||||
sizes: Tuple[int, int],
|
||||
transposed: bool = False,
|
||||
) -> jt.Var:
|
||||
if not transposed:
|
||||
output = jt.zeros((sizes[1], weight.size(-1)))
|
||||
else:
|
||||
output = jt.zeros((sizes[0], weight.size(-1)))
|
||||
|
||||
assert input.size(1) == weight.size(1)
|
||||
in_size = input.size(0)
|
||||
in_channels = input.size(1)
|
||||
out_size = output.size(0)
|
||||
out_channels = output.size(1)
|
||||
|
||||
kernel_volume = weight.size(0)
|
||||
flag = False
|
||||
mid = kernel_volume // 2
|
||||
in_buffer_size = jt.Var(1)
|
||||
if kernel_volume % 2 and out_size == in_size:
|
||||
flag = True
|
||||
in_buffer_size = max(in_buffer_size, jt.max(nbsizes[:mid]))
|
||||
in_buffer_size = max(in_buffer_size, jt.max(nbsizes[mid + 1:]))
|
||||
output = jt.matmul(input, weight[mid, :, :])
|
||||
else:
|
||||
in_buffer_size = jt.max(nbsizes)
|
||||
|
||||
# in_buffer_activated : in_buffer_size * in_channels
|
||||
# weight : in_channels * out_channels
|
||||
# out_buffer_activated: in_buffer_size * out_channels
|
||||
# out_buffer_activated = in_buffer_activated * weight
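# nbmaps pairs the active rows for each kernel offset: nbmaps[:, 0] holds
# input row indices and nbmaps[:, 1] output row indices, with nbsizes[i]
# rows per offset. For each offset, gather copies the active input rows
# into in_buffer_activated, the matmul applies weight[i], and scatter
# accumulates the result into the matching output rows.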
|
||||
in_buffer_activated = jt.zeros(in_buffer_size.tolist() + [in_channels])
|
||||
cur_offset = jt.Var(0)
|
||||
for i in range(kernel_volume):
|
||||
n_active_feats = nbsizes[i]
|
||||
if (flag and (i == mid)):
|
||||
cur_offset += n_active_feats
|
||||
continue
|
||||
if n_active_feats == 0:
|
||||
continue
|
||||
|
||||
t = 1 if transposed else 0
|
||||
################
|
||||
## gather
|
||||
################
|
||||
# print(n_active_feats, in_channels, cur_offset, t)
|
||||
gather(n_active_feats, in_channels, cur_offset, t, input, in_buffer_activated, nbmaps)
|
||||
################
|
||||
## matmul
|
||||
################
|
||||
out_buffer_activated = jt.matmul(in_buffer_activated, weight[i, :, :])
|
||||
################
|
||||
## scatter
|
||||
################
|
||||
scatter(n_active_feats, out_channels, cur_offset, t, out_buffer_activated, output, nbmaps)
|
||||
cur_offset += n_active_feats
|
||||
self.save_vars = input, weight, nbmaps, nbsizes, transposed
|
||||
return output
|
||||
|
||||
def grad(
|
||||
self,
|
||||
grad_output: jt.Var
|
||||
) -> Tuple[Optional[jt.Var], ...]:
|
||||
input, weight, nbmaps, nbsizes, transposed = self.save_vars
|
||||
|
||||
grad_input = jt.zeros_like(input)
|
||||
grad_weight = jt.zeros_like(weight)
|
||||
|
||||
# n_in_feats = input.size(0)
|
||||
# n_out_feats = grad_output.size(0)
|
||||
n_in_channels = input.size(1)
|
||||
n_out_channels = weight.size(-1)
|
||||
|
||||
kernel_volume = weight.size(0)
|
||||
flag = False
|
||||
in_buffer_size = jt.max(nbsizes)
|
||||
# out_grad_buffer_activated n_active_feats x n_out_channels
|
||||
# in_grad_buffer_activated n_active_feats x n_in_channels
|
||||
# in_buffer_activated n_active_feats x n_in_channels
|
||||
out_grad_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_out_channels])
|
||||
in_grad_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_in_channels])
|
||||
in_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_in_channels])
|
||||
|
||||
cur_offset = jt.Var(0)
|
||||
mid = kernel_volume // 2
|
||||
for i in range(kernel_volume):
|
||||
# kernel_grad_buffer = grad_weight[i, :, :]
|
||||
n_active_feats = nbsizes[i]
|
||||
# if flag and (i == mid):
|
||||
# cur_offset += n_active_feats
|
||||
# continue
|
||||
|
||||
if n_active_feats == 0:
|
||||
continue
|
||||
|
||||
t = 1 if transposed else 0
|
||||
################
|
||||
## gather
|
||||
################
|
||||
gather(n_active_feats, n_out_channels, cur_offset, 1 - t, grad_output, out_grad_buffer_activated, nbmaps)
|
||||
gather(n_active_feats, n_in_channels , cur_offset, t , input , in_buffer_activated , nbmaps)
|
||||
################
|
||||
## matmul
|
||||
################
|
||||
# grad for input
|
||||
# in_grad_buffer_activated = out_grad_buffer_activated @ weight^T
|
||||
# n_active_feats x n_in_channels n_active_feats x n_out_channels n_out_channels x n_in_channels
|
||||
in_grad_buffer_activated = jt.nn.matmul_transpose(out_grad_buffer_activated, weight[i, :, :])
|
||||
# grad for weight
|
||||
# kernel_grad_buffer = in_buffer_activated^T @ out_grad_buffer_activated
|
||||
# n_in_channels x n_out_channels n_in_channels x n_active_feats n_active_feats x n_out_channels
|
||||
grad_weight[i, :, :] = jt.nn.matmul(in_buffer_activated.t(), out_grad_buffer_activated)
|
||||
################
|
||||
## scatter
|
||||
################
|
||||
scatter(n_active_feats, n_in_channels, cur_offset, 1 - t, in_grad_buffer_activated, grad_input, nbmaps)
|
||||
cur_offset += n_active_feats
|
||||
return grad_input, grad_weight, None, None, None, None
|
||||
|
||||
def gather(
|
||||
n_active_feats,
|
||||
channels,
|
||||
cur_offset,
|
||||
transpose,
|
||||
in_feat,
|
||||
out_feat,
|
||||
kmap,
|
||||
):
|
||||
shape = n_active_feats.tolist() + cur_offset.tolist() + [channels, transpose, 0]
|
||||
gather_args = jt.zeros(shape, dtype='int32')
|
||||
return jt.code((0, ), out_feat.dtype, [in_feat, out_feat, kmap, gather_args],
|
||||
cuda_header="""
|
||||
@alias(in_feat, in0)
|
||||
@alias(out_feat, in1)
|
||||
@alias(kmap, in2)
|
||||
@alias(args, in3)
|
||||
""",
|
||||
cuda_src="""
|
||||
__global__ void gather_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
const int n_k = args_shape0;
|
||||
const int st = args_shape1;
|
||||
const int c = args_shape2;
|
||||
const int transpose = args_shape3;
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int i = idx / c;
|
||||
int j = idx % c;
|
||||
if (i >= n_k) return;
|
||||
int in_pos = @kmap(st + i, transpose);
|
||||
// if (in_pos < 0) return;
|
||||
@out_feat(i, j) = @in_feat(in_pos, j);
|
||||
}
|
||||
|
||||
gather_kernel<<< (out_feat_shape0 * out_feat_shape1 + 255) / 256, 256 >>>(@ARGS);
|
||||
""",
|
||||
cpu_header="""
|
||||
@alias(in_feat, in0)
|
||||
@alias(out_feat, in1)
|
||||
@alias(kmap, in2)
|
||||
@alias(args, in3)
|
||||
""",
|
||||
cpu_src="""
|
||||
const int n_k = args_shape0;
|
||||
const int st = args_shape1;
|
||||
const int c = args_shape2;
|
||||
const int transpose = args_shape3;
|
||||
|
||||
for (int i = 0; i < n_k; ++ i ) {
|
||||
int in_pos = @kmap(st + i, transpose);
|
||||
// if (in_pos < 0) {
|
||||
// continue;
|
||||
// }
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < c; ++ j ) {
|
||||
@out_feat(i, j) = @in_feat(in_pos, j);
|
||||
// @out(i, j) = @in_feat(in_pos, j);
|
||||
}
|
||||
}
|
||||
"""
|
||||
).sync()
|
||||
|
||||
|
||||
def scatter(
|
||||
n_active_feats,
|
||||
channels,
|
||||
cur_offset,
|
||||
transpose,
|
||||
in_feat,
|
||||
out_feat,
|
||||
kmap,
|
||||
):
|
||||
shape = n_active_feats.tolist() + cur_offset.tolist() + [channels, transpose, 0]
|
||||
scatter_args = jt.zeros(shape, dtype='int32')
|
||||
return jt.code((0, ), out_feat.dtype, [in_feat, out_feat, kmap, scatter_args],
|
||||
cuda_header="""
|
||||
@alias(in_feat, in0)
|
||||
@alias(out_feat, in1)
|
||||
@alias(kmap, in2)
|
||||
@alias(args, in3)
|
||||
""",
|
||||
cuda_src="""
|
||||
__global__ void scatter_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
const int n_k = args_shape0;
|
||||
const int st = args_shape1;
|
||||
const int c = args_shape2;
|
||||
const int transpose = args_shape3;
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int i = idx / c;
|
||||
int j = idx % c;
|
||||
if (i >= n_k) return;
|
||||
int out_pos = @kmap(st + i, 1 - transpose);
|
||||
// if (out_pos < 0) return;
|
||||
@out_feat(out_pos, j) += @in_feat(i, j);
|
||||
}
|
||||
|
||||
scatter_kernel<<< (out_feat_shape0 * out_feat_shape1 + 255) / 256, 256 >>>(@ARGS);
|
||||
""",
|
||||
cpu_header="""
|
||||
@alias(in_feat, in0)
|
||||
@alias(out_feat, in1)
|
||||
@alias(kmap, in2)
|
||||
@alias(args, in3)
|
||||
""",
|
||||
cpu_src="""
|
||||
const int n_k = args_shape0;
|
||||
const int st = args_shape1;
|
||||
const int c = args_shape2;
|
||||
const int transpose = args_shape3;
|
||||
|
||||
for (int i = 0; i < n_k; ++ i ) {
|
||||
int out_pos = @kmap(st + i, 1 - transpose);
|
||||
// if (out_pos < 0) {
|
||||
// continue;
|
||||
// }
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < c; ++ j ) {
|
||||
@out_feat(out_pos, j) += @in_feat(i, j);
|
||||
// @out(out_pos, j) += @in_feat(i, j);
|
||||
}
|
||||
}
|
||||
"""
|
||||
).sync()
|
||||
|
||||
def conv3d(
|
||||
input: SparseTensor,
|
||||
weight: jt.Var,
|
||||
kernel_size: Union[int, Tuple[int, ...]],
|
||||
bias: Optional[jt.Var] = None,
|
||||
stride: Union[int, Tuple[int, ...]] = 1,
|
||||
dilation: Union[int, Tuple[int, ...]] = 1,
|
||||
groups: int = 1,
|
||||
transposed: bool = False,
|
||||
) -> SparseTensor:
|
||||
# kernel_size = make_ntuple(kernel_size, ndim=3)
|
||||
# stride = make_ntuple(stride, ndim=3)
|
||||
# dilation = make_ntuple(dilation, ndim=3)
|
||||
kernel_size = _triple(kernel_size)
|
||||
stride = _triple(stride)
|
||||
dilation = _triple(dilation)
|
||||
|
||||
if (kernel_size == _triple(1) and stride == _triple(1) and dilation == _triple(1)):
|
||||
output_stride = input.stride
|
||||
output_indices = input.indices
|
||||
output_values = input.values.matmul(weight)
|
||||
elif not transposed:
|
||||
output_stride = tuple(input.stride[k] * stride[k] for k in range(3))
|
||||
|
||||
if output_stride in input.cmaps:
|
||||
output_indices = input.cmaps[output_stride]
|
||||
elif all(stride[k] == 1 for k in range(3)):
|
||||
output_indices = input.indices
|
||||
else:
|
||||
output_indices = F.spdownsample(
|
||||
input.indices,
|
||||
stride,
|
||||
kernel_size,
|
||||
input.stride,
|
||||
)
|
||||
|
||||
if (input.stride, kernel_size, stride, dilation) not in input.kmaps:
|
||||
offsets = get_kernel_offsets(
|
||||
kernel_size,
|
||||
stride=input.stride,
|
||||
dilation=dilation,
|
||||
)
|
||||
references = F.sphash(input.indices) # (N,)
|
||||
queries = F.sphash(output_indices, offsets) # (|K|, N)
|
||||
results = F.spquery(queries, references) # (|K|, N)
|
||||
|
||||
nbsizes = jt.sum(results != -1, dim=1)
|
||||
nbmaps = jt.nonzero(results != -1)
|
||||
|
||||
indices = nbmaps[:, 0] * results.size(1) + nbmaps[:, 1]
|
||||
nbmaps[:, 0] = results.view(-1)[indices]
|
||||
|
||||
input.kmaps[(input.stride, kernel_size, stride, dilation)] = [
|
||||
nbmaps, nbsizes, (input.indices.shape[0], output_indices.shape[0])
|
||||
]
|
||||
|
||||
output_values = Convolution.apply(
|
||||
input.values,
|
||||
weight,
|
||||
*input.kmaps[(input.stride, kernel_size, stride, dilation)],
|
||||
transposed,
|
||||
)
|
||||
else:
|
||||
output_stride = tuple(input.stride[k] // stride[k] for k in range(3))
|
||||
output_indices = input.cmaps[output_stride]
|
||||
output_values = Convolution.apply(
|
||||
input.values,
|
||||
weight,
|
||||
*input.kmaps[(output_stride, kernel_size, stride, dilation)],
|
||||
transposed,
|
||||
)
|
||||
|
||||
if bias is not None:
|
||||
output_values += bias
|
||||
|
||||
# size have to be set
|
||||
output = SparseTensor(
|
||||
indices=output_indices,
|
||||
values=output_values,
|
||||
stride=output_stride,
|
||||
size=input.size
|
||||
)
|
||||
output.cmaps = input.cmaps
|
||||
output.cmaps.setdefault(output_stride, output_indices)
|
||||
output.kmaps = input.kmaps
|
||||
return output
|
|
@ -1,30 +0,0 @@
|
|||
import jittor as jt
|
||||
|
||||
def spcount(idx_query: jt.Var, num: int) -> jt.Var:
|
||||
return jt.code((num,), jt.int32, [idx_query],
|
||||
cuda_src="""
|
||||
__global__ void count_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
@alias(idx_query, in0)
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= idx_query_shape0) return;
int cur_idx = @idx_query(i);
if (cur_idx >= 0) {
    atomicAdd(out_p + cur_idx, 1);
}
|
||||
}
|
||||
@alias(idx_query, in0)
|
||||
count_kernel<<<(idx_query_shape0 + 511) / 512, 512>>>(@ARGS);
|
||||
""",
|
||||
cpu_src="""
|
||||
@alias(idx_query, in0)
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < idx_query_shape0; ++ i ) {
|
||||
int cur_idx = @idx_query(i);
|
||||
if (cur_idx < 0) {
|
||||
continue;
|
||||
}
|
||||
#pragma omp atomic
|
||||
@out(cur_idx) ++;
|
||||
}
|
||||
"""
|
||||
)
|
|
@ -1,166 +0,0 @@
|
|||
import jittor as jt
|
||||
from jittor import Function
|
||||
|
||||
from JSparse import SparseTensor
|
||||
|
||||
__all__ = ['calc_ti_weights', 'spdevoxelize']
|
||||
|
||||
def calc_ti_weights(
|
||||
indices: jt.Var,
|
||||
idx_query: jt.Var,
|
||||
scale: float = 1
|
||||
) -> jt.Var:
|
||||
with jt.no_grad():
|
||||
p = indices
|
||||
if scale != 1:
|
||||
pf = jt.floor(indices / scale) * scale
|
||||
else:
|
||||
pf = jt.floor(indices)
|
||||
pc = pf + scale
|
||||
|
||||
x = p[:, 1].view(-1, 1)
|
||||
y = p[:, 2].view(-1, 1)
|
||||
z = p[:, 3].view(-1, 1)
|
||||
|
||||
xf = pf[:, 1].view(-1, 1).float()
|
||||
yf = pf[:, 2].view(-1, 1).float()
|
||||
zf = pf[:, 3].view(-1, 1).float()
|
||||
|
||||
xc = pc[:, 1].view(-1, 1).float()
|
||||
yc = pc[:, 2].view(-1, 1).float()
|
||||
zc = pc[:, 3].view(-1, 1).float()
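# w0..w7 below are the trilinear interpolation weights of the eight voxel
# corners surrounding each point: each weight is the product of the
# point's distances to the opposite corner along x, y and z, so the eight
# weights sum to scale**3 before normalization.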
|
||||
|
||||
w0 = (xc - x) * (yc - y) * (zc - z)
|
||||
w1 = (xc - x) * (yc - y) * (z - zf)
|
||||
w2 = (xc - x) * (y - yf) * (zc - z)
|
||||
w3 = (xc - x) * (y - yf) * (z - zf)
|
||||
w4 = (x - xf) * (yc - y) * (zc - z)
|
||||
w5 = (x - xf) * (yc - y) * (z - zf)
|
||||
w6 = (x - xf) * (y - yf) * (zc - z)
|
||||
w7 = (x - xf) * (y - yf) * (z - zf)
|
||||
|
||||
w = jt.concat([w0, w1, w2, w3, w4, w5, w6, w7], dim=1).t()
|
||||
if scale != 1:
|
||||
w /= scale ** 3
|
||||
w[idx_query == -1] = 0
|
||||
w /= jt.sum(w, dim=0) + 1e-8
|
||||
return w
|
||||
|
||||
|
||||
class Devoxelize(Function):
|
||||
def execute(
|
||||
self,
|
||||
values: jt.Var,
|
||||
idx_query: jt.Var,
|
||||
weights: jt.Var
|
||||
) -> jt.Var:
|
||||
# c = values_shape1
|
||||
# N = idx_query_shape0
|
||||
output = jt.code((idx_query.shape[0], values.shape[1]), jt.float32, [values, idx_query, weights],
|
||||
cuda_src="""
|
||||
__global__ void devoxelize_forward_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
@alias(values, in0)
|
||||
@alias(idx_query, in1)
|
||||
@alias(weights, in2)
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int i = index / values_shape1;
|
||||
int j = index % values_shape1;
|
||||
|
||||
if (i < idx_query_shape0) {
|
||||
float cur_values = 0;
|
||||
for (int k = 0; k < 8; ++ k ) {
|
||||
int idx = @idx_query(i, k);
|
||||
cur_values = (idx >= 0) ? @values(idx, j) : 0;
|
||||
@out(i, j) += @weights(i, k) * cur_values;
|
||||
}
|
||||
}
|
||||
}
|
||||
devoxelize_forward_kernel<<<out_shape0, out_shape1>>>(@ARGS);
|
||||
""",
|
||||
cpu_src="""
|
||||
@alias(values, in0)
|
||||
@alias(idx_query, in1)
|
||||
@alias(weights, in2)
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < idx_query_shape0; ++ i ) {
|
||||
for (int j = 0; j < values_shape1; ++ j ) {
|
||||
float cur_values = 0;
|
||||
for (int k = 0; k < 8; ++ k ) {
|
||||
int idx = @idx_query(i, k);
|
||||
cur_values = (idx >= 0) ? @values(idx, j) : 0;
|
||||
#pragma omp atomic
|
||||
@out(i, j) += @weights(i, k) * cur_values;
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
)
|
||||
self.save_vars = (idx_query, weights, values.shape[0])
|
||||
return output
|
||||
|
||||
def grad(self, grad_output: jt.Var):
|
||||
idx_query, weights, input_size = self.save_vars
|
||||
|
||||
grad_values = jt.code((input_size, grad_output.shape[1]), jt.float32, [idx_query, weights, grad_output],
|
||||
cuda_header="""
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cuda_runtime.h>
|
||||
""",
|
||||
cuda_src="""
|
||||
__global__ void devoxelize_backward_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
@alias(idx_query, in0)
|
||||
@alias(weights, in1)
|
||||
@alias(grad_output, in2)
|
||||
|
||||
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int c = grad_output_shape1;
|
||||
int i = index / c;
|
||||
int j = index % c;
|
||||
|
||||
if (i < grad_output_shape0) {
|
||||
float cur_grad_output = @grad_output(i, j);
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 8; ++ k ) {
|
||||
int idx = @idx_query(i, k);
|
||||
if (idx >= 0) {
|
||||
atomicAdd(&@out(idx, j), @weights(i, k) * cur_grad_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@alias(grad_output, in2)
|
||||
devoxelize_backward_kernel<<<grad_output_shape0, grad_output_shape1>>>(@ARGS);
|
||||
""",
|
||||
cpu_src="""
|
||||
@alias(idx_query, in0)
|
||||
@alias(weights, in1)
|
||||
@alias(grad_output, in2)
|
||||
|
||||
for (int i = 0; i < grad_output_shape0; ++ i ) {
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < grad_output_shape1; ++ j ) {
|
||||
float cur_grad_output = 0;
|
||||
for (int k = 0; k < 8; ++ k ) {
|
||||
int idx = @idx_query(i, k);
|
||||
if (idx < 0) continue;
cur_grad_output = @grad_output(i, j);
#pragma omp atomic
@out(idx, j) += @weights(i, k) * cur_grad_output;
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
)
|
||||
return grad_values, None, None
|
||||
|
||||
def spdevoxelize(
|
||||
values: jt.Var,
|
||||
idx_query: jt.Var,
|
||||
weights: jt.Var
|
||||
) -> jt.Var:
|
||||
return Devoxelize.apply(values, idx_query, weights)
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
from typing import Tuple, Union
|
||||
|
||||
import jittor as jt
|
||||
from jittor.misc import _pair, _triple
|
||||
|
||||
from JSparse.nn.utils import get_kernel_offsets
|
||||
from JSparse.utils import make_ntuple, trunc
|
||||
|
||||
__all__ = ['spdownsample']
|
||||
|
||||
def spdownsample(
|
||||
indices: jt.Var,
|
||||
stride: Union[int, Tuple[int, ...]] = 2,
|
||||
kernel_size: Union[int, Tuple[int, ...]] = 2,
|
||||
tensor_stride: Union[int, Tuple[int, ...]] = 1) -> jt.Var:
|
||||
# stride = make_ntuple(stride, ndim=3)
|
||||
# kernel_size = make_ntuple(kernel_size, ndim=3)
|
||||
# tensor_stride = make_ntuple(tensor_stride, ndim=3)
|
||||
kernel_size = _triple(kernel_size)
|
||||
stride = _triple(stride)
|
||||
tensor_stride = _triple(tensor_stride)
|
||||
|
||||
sample_stride = [stride[k] * tensor_stride[k] for k in range(3)]
|
||||
sample_stride = jt.Var(sample_stride,
|
||||
dtype='int32').unsqueeze(dim=0)
|
||||
|
||||
if all(stride[k] in [1, kernel_size[k]] for k in range(3)):
|
||||
indices = indices.clone()
|
||||
indices[:, 1:] = trunc(jt.divide(indices[:, 1:], sample_stride)) * sample_stride
|
||||
else:
|
||||
offsets = get_kernel_offsets(kernel_size,
|
||||
tensor_stride)
|
||||
kernel_volume = offsets.size(0)
|
||||
|
||||
indices_min = indices[:, 1:].min(dim=0, keepdims=True)
|
||||
|
||||
b = indices[:, :1].repeat(1, kernel_volume)
|
||||
x = indices[:, 1:].unsqueeze(dim=1).repeat(1, kernel_volume, 1) + offsets
|
||||
indices = jt.cat([b.view(-1, 1), x.view(-1, 3)], dim=1)
|
||||
|
||||
# TODO: We need to also filter `indices` based on `indices_max`.
|
||||
mask = (indices[:, 1:] % sample_stride == 0)
|
||||
mask &= (indices[:, 1:] >= indices_min)
|
||||
mask = jt.all(mask, dim=1)
|
||||
indices = indices[mask]
|
||||
|
||||
# we may have to unique the indices when we define the SparesTensor
|
||||
# indices = indices[:, [3, 0, 1, 2]]
|
||||
# indices = jt.unique(indices, dim=0)
|
||||
# indices = indices[:, [1, 2, 3, 0]]
|
||||
return indices
|
|
@ -1,204 +0,0 @@
|
|||
from typing import Optional
|
||||
|
||||
import jittor as jt
|
||||
|
||||
__all__ = ['sphash']
|
||||
|
||||
def sphash(indices: jt.Var,
|
||||
offsets: Optional[jt.Var] = None) -> jt.Var:
|
||||
assert indices.dtype == jt.int, indices.dtype
|
||||
assert indices.ndim == 2 and indices.shape[1] == 4, indices.shape
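# The 64-bit hash below follows the FNV-1a scheme (offset basis
# 14695981039346656037, prime 1099511628211) over the four integer
# coordinates (batch, x, y, z); when `offsets` is given, one hash is
# produced per (kernel offset, point) pair.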
|
||||
|
||||
if offsets is None:
|
||||
return jt.code((indices.shape[0],), jt.int64, [indices],
|
||||
cuda_header="""
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
@alias(indices, in0)
|
||||
""",
|
||||
cuda_src="""
|
||||
__global__ static void hash_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
|
||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i < indices_shape0) {
|
||||
uint64_t hash = 14695981039346656037UL;
|
||||
for (int j = 0; j < 4; ++ j ) {
|
||||
hash ^= (unsigned int)@indices(i, j);
|
||||
hash *= 1099511628211UL;
|
||||
}
|
||||
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
|
||||
@out(i) = hash;
|
||||
}
|
||||
}
|
||||
hash_kernel<<<ceil((double)indices_shape0 / 512), 512>>>(@ARGS);
|
||||
""",
|
||||
cpu_header="""
|
||||
#include <vector>
|
||||
@alias(indices, in0)
|
||||
""",
|
||||
cpu_src="""
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < indices_shape0; ++ i ) {
|
||||
uint64_t hash = 14695981039346656037UL;
|
||||
for (int j = 0; j < 4; ++ j ) {
|
||||
hash ^= (unsigned int)@indices(i, j);
|
||||
hash *= 1099511628211UL;
|
||||
}
|
||||
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
|
||||
@out(i) = hash;
|
||||
}
|
||||
""")
|
||||
else:
|
||||
assert offsets.dtype == jt.int32, offsets.dtype
|
||||
assert offsets.ndim == 2 and offsets.shape[1] == 3, offsets.shape
|
||||
|
||||
return jt.code((offsets.shape[0], indices.shape[0]), jt.int64, [indices, offsets],
|
||||
cuda_header="""
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
@alias(indices, in0)
|
||||
@alias(offsets, in1)
|
||||
""",
|
||||
# cuda_src="""
|
||||
# __global__ void kernel_hash_kernel(@ARGS_DEF){
|
||||
# @PRECALC
|
||||
# extern __shared__ int offsets_shared[];
|
||||
|
||||
# int ix = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
# int iy = blockIdx.y;
|
||||
|
||||
# // if (!threadIdx.x) {
|
||||
# // for (int j = 0; j < 3; ++ j ) {
|
||||
# // offsets_shared[iy * 3 + j] = @offsets(iy, j);
|
||||
# // }
|
||||
# // }
|
||||
# // __syncthreads();
|
||||
|
||||
# if (!threadIdx.x) {
|
||||
# for (int j = 0; j < 3; ++ j ) {
|
||||
# offsets_shared[iy * 3 + j] = @offsets(iy, j);
|
||||
# }
|
||||
# }
|
||||
# __syncthreads();
|
||||
|
||||
# if (ix < indices_shape0 && iy < offsets_shape0) {
|
||||
# int cur_indices[4];
|
||||
# for (int j = 1; j <= 3; ++ j ) {
|
||||
# // cur_indices[j] = @indices(ix, j) + @offsets(iy, j - 1);
|
||||
# cur_indices[j] = @indices(ix, j) + offsets_shared[iy * 3 + j - 1];
|
||||
# }
|
||||
# cur_indices[0] = @indices(ix, 0);
|
||||
# uint64_t hash = 14695981039346656037UL;
|
||||
# for (int j = 0; j < 4; ++ j ) {
|
||||
# hash ^= (unsigned int)cur_indices[j];
|
||||
# hash *= 1099511628211UL;
|
||||
# }
|
||||
# hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
|
||||
# @out0(iy, ix) = hash;
|
||||
# }
|
||||
# }
|
||||
# dim3 block(512);
|
||||
# dim3 grid((indices_shape0 + block.x - 1) / block.x, offsets_shape0);
|
||||
# kernel_hash_kernel<<< grid, block, offsets_shape0 * 3 * sizeof(offsets_type) >>>(@ARGS);
|
||||
# """,
|
||||
# cuda_src="""
|
||||
# __global__ void kernel_hash_kernel(@ARGS_DEF){
|
||||
# @PRECALC
|
||||
# extern __shared__ int offsets_shared[];
|
||||
|
||||
# int ix = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
# int iy = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
# // if (!threadIdx.x) {
|
||||
# // for (int j = 0; j < 3; ++ j ) {
|
||||
# // offsets_shared[iy * 3 + j] = @offsets(iy, j);
|
||||
# // }
|
||||
# // }
|
||||
# // __syncthreads();
|
||||
|
||||
# if (iy < indices_shape0 && ix < offsets_shape0) {
|
||||
# int cur_indices[4];
|
||||
# // for (int j = 1; j <= 3; ++ j ) {
|
||||
# // cur_indices[j] = @indices(iy, j) + @offsets(ix, j - 1);
|
||||
# // cur_indices[j] = @indices(iy, j) + offsets_shared[ix * 3 + j - 1];
|
||||
# // }
|
||||
# cur_indices[0] = @indices(iy, 0);
|
||||
# cur_indices[1] = @indices(iy, 1) + @offsets(ix, 0);
|
||||
# cur_indices[2] = @indices(iy, 2) + @offsets(ix, 1);
|
||||
# cur_indices[3] = @indices(iy, 3) + @offsets(ix, 2);
|
||||
# uint64_t hash = 14695981039346656037UL;
|
||||
# for (int j = 0; j < 4; ++ j ) {
|
||||
# hash ^= (unsigned int)cur_indices[j];
|
||||
# hash *= 1099511628211UL;
|
||||
# }
|
||||
# hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
|
||||
# @out0(ix, iy) = hash;
|
||||
# }
|
||||
# }
|
||||
# dim3 block(16, 64);
|
||||
# dim3 grid((offsets_shape0 + block.x - 1) / block.x), (indices_shape0 + block.y - 1) / block.y);
|
||||
# kernel_hash_kernel<<< grid, block, offsets_shape0 * 3 * sizeof(offsets_type) >>>(@ARGS);
|
||||
# """,
|
||||
cuda_src="""
|
||||
__global__ void kernel_hash_kernel(@ARGS_DEF){
|
||||
@PRECALC
|
||||
extern __shared__ int offsets_shared[];
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int k = idx % offsets_shape0;
|
||||
int i = idx / offsets_shape0;
|
||||
|
||||
if (i < indices_shape0) {
|
||||
int cur_indices[4];
|
||||
for (int j = 1; j <= 3; ++ j ) {
|
||||
cur_indices[j] = @indices(i, j) + @offsets(k, j - 1);
|
||||
}
|
||||
cur_indices[0] = @indices(i, 0);
|
||||
|
||||
uint64_t hash = 14695981039346656037UL;
|
||||
for (int j = 0; j < 4; ++ j ) {
|
||||
hash ^= (unsigned int)cur_indices[j];
|
||||
hash *= 1099511628211UL;
|
||||
}
|
||||
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
|
||||
@out(k, i) = hash;
|
||||
}
|
||||
}
|
||||
int thread_nums = 512;
|
||||
kernel_hash_kernel<<<ceil((double)(indices_shape0 * offsets_shape0) / thread_nums), thread_nums, offsets_shape0 * 3 * sizeof(int)>>>(@ARGS);
|
||||
""",
|
||||
cpu_header="""
|
||||
#include <vector>
|
||||
@alias(indices, in0)
|
||||
@alias(offsets, in1)
|
||||
""",
|
||||
cpu_src="""
|
||||
auto K = offsets_shape0;
|
||||
auto N = indices_shape0;
|
||||
|
||||
for (int k = 0; k < offsets_shape0; ++ k ) {
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < indices_shape0; ++ i ) {
|
||||
int cur_indices[4];
|
||||
for (int j = 1; j <= 3; ++ j ) {
|
||||
cur_indices[j] = @indices(i, j) + @offsets(k, j - 1);
|
||||
}
|
||||
cur_indices[0] = @indices(i, 0);
|
||||
uint64_t hash = 14695981039346656037UL;
|
||||
for (int j = 0; j < 4; ++ j ) {
|
||||
hash ^= (unsigned int)cur_indices[j];
|
||||
hash *= 1099511628211UL;
|
||||
}
|
||||
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
|
||||
@out(k, i) = hash;
|
||||
}
|
||||
}
|
||||
"""
|
||||
)
|
|
@ -1,26 +0,0 @@
|
|||
import jittor as jt
|
||||
|
||||
from JSparse import SparseTensor
|
||||
|
||||
__all__ = ['global_avg_pool', 'global_max_pool']
|
||||
|
||||
def global_avg_pool(inputs: SparseTensor) -> jt.Var:
|
||||
batch_size = jt.max(inputs.indices[:, 0]).item() + 1
|
||||
outputs = []
|
||||
for k in range(batch_size):
|
||||
input = inputs.values[inputs.indices[:, 0] == k]
|
||||
output = jt.mean(input, dim=0)
|
||||
outputs.append(output)
|
||||
outputs = jt.stack(outputs, dim=0)
|
||||
return outputs
|
||||
|
||||
|
||||
def global_max_pool(inputs: SparseTensor) -> jt.Var:
|
||||
batch_size = jt.max(inputs.indices[:, 0]).item() + 1
|
||||
outputs = []
|
||||
for k in range(batch_size):
|
||||
input = inputs.values[inputs.indices[:, 0] == k]
|
||||
output = jt.max(input, dim=0)[0]
|
||||
outputs.append(output)
|
||||
outputs = jt.stack(outputs, dim=0)
|
||||
return outputs
|
|
@ -1,454 +0,0 @@
|
|||
import jittor as jt
|
||||
import math
|
||||
|
||||
__all__ = ['spquery']
|
||||
|
||||
def spquery(queries: jt.Var,
|
||||
references: jt.Var) -> jt.Var:
|
||||
q_size = queries.size()
|
||||
queries = queries.view(-1)
|
||||
|
||||
indices = jt.arange(len(references), dtype=jt.int64)
|
||||
|
||||
if jt.flags.use_cuda > 0:
|
||||
n = references.shape[0]
|
||||
nextPow2 = pow(2, math.ceil(math.log2(n)))
|
||||
|
||||
table_size = (4 * nextPow2) if (n < 2048) else (2 * nextPow2)
|
||||
if table_size < 512:
|
||||
table_size = 512
|
||||
num_funcs = 3
|
||||
key_buf = jt.zeros((table_size,), dtype='int64')
|
||||
val_buf = jt.zeros((table_size,), dtype='int64')
|
||||
key = jt.zeros((num_funcs * table_size,), dtype='int64')
|
||||
val = jt.zeros((num_funcs * table_size,), dtype='int64')
|
||||
|
||||
output = jt.code(queries.shape, jt.int64, [queries, references, indices, key_buf, val_buf, key, val],
|
||||
cuda_header="""
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
/** Reserved value for indicating "empty". */
|
||||
#define EMPTY_CELL (0)
|
||||
/** Max rehashing depth, and error depth. */
|
||||
#define MAX_DEPTH (100)
|
||||
#define ERR_DEPTH (-1)
|
||||
/** CUDA naive thread block size. */
|
||||
#define BLOCK_SIZE (256)
|
||||
/** CUDA multi-level thread block size = bucket size. */
|
||||
#define BUCKET_SIZE (512)
|
||||
|
||||
__device__ static uint64_t atomicExch(uint64_t *addr, uint64_t val) {
|
||||
return (uint64_t)atomicExch((unsigned long long int *)addr,
|
||||
(unsigned long long int)val);
|
||||
}
|
||||
|
||||
/** Struct of a hash function config. */
|
||||
typedef struct {
|
||||
int rv; // Randomized XOR value.
|
||||
int ss; // Randomized shift filter start position.
|
||||
} FuncConfig;
|
||||
|
||||
/** Hard code hash functions and all inline helper functions for CUDA kernels'
|
||||
* use. */
|
||||
inline __device__ int do_1st_hash(const uint64_t val, const int num_buckets) {
|
||||
return val % num_buckets;
|
||||
}
|
||||
|
||||
inline __device__ int do_2nd_hash(const uint64_t val,
|
||||
const FuncConfig *const hash_func_configs,
|
||||
const int func_idx, const int size) {
|
||||
FuncConfig fc = hash_func_configs[func_idx];
|
||||
return ((val ^ fc.rv) >> fc.ss) % size; // XOR function as 2nd-level hashing.
|
||||
}
|
||||
|
||||
// trying to ignore EMPTY_CELL by adding 1 at make_data.
|
||||
inline __device__ uint64_t fetch_val(const uint64_t data, const int pos_width) {
|
||||
return data >> pos_width;
|
||||
}
|
||||
|
||||
inline __device__ int fetch_func(const uint64_t data, const int pos_width) {
|
||||
return data & ((0x1 << pos_width) - 1);
|
||||
}
|
||||
|
||||
inline __device__ uint64_t make_data(const uint64_t val, const int func,
|
||||
const int pos_width) {
|
||||
return (val << pos_width) ^ func;
|
||||
}
|
||||
|
||||
class CuckooHashTableCuda_Multi {
|
||||
private:
|
||||
const int _size;
|
||||
const int _evict_bound;
|
||||
const int _num_funcs;
|
||||
const int _pos_width;
|
||||
const int _num_buckets;
|
||||
|
||||
FuncConfig *_d_hash_func_configs;
|
||||
|
||||
/** Cuckoo hash function set. */
|
||||
FuncConfig *_hash_func_configs;
|
||||
|
||||
/** Private operations. */
|
||||
void gen_hash_funcs() {
|
||||
// Calculate bit width of value range and table size.
|
||||
int val_width = 8 * sizeof(uint64_t) - ceil(log2((double)_num_funcs));
|
||||
int bucket_width = ceil(log2((double)_num_buckets));
|
||||
int size_width = ceil(log2((double)BUCKET_SIZE));
|
||||
// Generate randomized configurations.
|
||||
for (int i = 0; i < _num_funcs; ++i) { // At index 0 is a dummy function.
|
||||
if (val_width - bucket_width <= size_width)
|
||||
_hash_func_configs[i] = {rand(), 0};
|
||||
else {
|
||||
_hash_func_configs[i] = {
|
||||
rand(), rand() % (val_width - bucket_width - size_width + 1) +
|
||||
bucket_width};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
inline uint64_t fetch_val(const uint64_t data) { return data >> _pos_width; }
|
||||
inline int fetch_func(const uint64_t data) {
|
||||
return data & ((0x1 << _pos_width) - 1);
|
||||
}
|
||||
|
||||
public:
|
||||
CuckooHashTableCuda_Multi(const int size, const int evict_bound,
|
||||
const int num_funcs)
|
||||
: _size(size),
|
||||
_evict_bound(evict_bound),
|
||||
_num_funcs(num_funcs),
|
||||
_pos_width(ceil(log2((double)_num_funcs))),
|
||||
_num_buckets(ceil((double)_size / BUCKET_SIZE)) {
|
||||
srand(time(NULL));
|
||||
_d_hash_func_configs = NULL;
|
||||
_hash_func_configs = NULL;
|
||||
_hash_func_configs = new FuncConfig[num_funcs];
|
||||
|
||||
gen_hash_funcs();
|
||||
|
||||
cudaMalloc((void **)&_d_hash_func_configs, _num_funcs * sizeof(FuncConfig));
|
||||
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
|
||||
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
|
||||
};
|
||||
~CuckooHashTableCuda_Multi() {
|
||||
if (_hash_func_configs != NULL) delete[] _hash_func_configs;
|
||||
|
||||
if (_d_hash_func_configs != NULL) cudaFree(_d_hash_func_configs);
|
||||
};
|
||||
|
||||
int insert_vals(const uint64_t *const keys, const uint64_t *const vals,
|
||||
uint64_t *d_key_buf, uint64_t *d_val_buf, uint64_t *d_key,
|
||||
uint64_t *d_val, const int n);
|
||||
|
||||
void lookup_vals(const uint64_t *const keys, uint64_t *const results,
|
||||
uint64_t *d_key, uint64_t *d_val, const int n);
|
||||
};
|
||||
|
||||
__global__ void cuckooBucketKernel_Multi(
|
||||
uint64_t *const key_buf, uint64_t *const val_buf, const int size,
|
||||
const uint64_t *const keys, const uint64_t *const vals, const int n,
|
||||
int *const counters, const int num_buckets);
|
||||
|
||||
__global__ void cuckooInsertKernel_Multi(
|
||||
uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf,
|
||||
const uint64_t *const val_buf, const int size,
|
||||
const FuncConfig *const hash_func_configs, const int num_funcs,
|
||||
const int *const counters, const int num_buckets, const int evict_bound,
|
||||
const int pos_width, int *const rehash_requests);
|
||||
|
||||
__global__ void cuckooLookupKernel_Multi(
|
||||
const uint64_t *const keys, uint64_t *const results, const int n,
|
||||
const uint64_t *const all_keys, const uint64_t *const all_vals,
|
||||
const int size, const FuncConfig *const hash_func_configs,
|
||||
const int num_funcs, const int num_buckets, const int pos_width);
|
||||
|
||||
__global__ void cuckooBucketKernel_Multi(
|
||||
uint64_t *const key_buf, uint64_t *const val_buf, const int size,
|
||||
const uint64_t *const keys, const uint64_t *const vals, const int n,
|
||||
int *const counters, const int num_buckets) {
|
||||
// Get thread index.
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
// Only threads within range are active.
|
||||
if (idx < n) {
|
||||
// Do 1st-level hashing to get bucket id, then do atomic add to get index
|
||||
// inside the bucket.
|
||||
uint64_t key = keys[idx];
|
||||
uint64_t val = vals[idx];
|
||||
|
||||
int bucket_num = do_1st_hash(key, num_buckets);
|
||||
int bucket_ofs = atomicAdd(&counters[bucket_num], 1);
|
||||
|
||||
// Directly write the key into the table buffer.
|
||||
if (bucket_ofs >= BUCKET_SIZE) {
|
||||
printf("%d/%d ERROR: bucket overflow! n=%d, bucket_num=%d/%d, key=%d", bucket_ofs, BUCKET_SIZE, n, bucket_num, num_buckets, key);
|
||||
} else {
|
||||
key_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = key;
|
||||
val_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cuckooInsertKernel_Multi(
|
||||
uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf,
|
||||
const uint64_t *const val_buf, const int size,
|
||||
const FuncConfig *const hash_func_configs, const int num_funcs,
|
||||
const int *const counters, const int num_buckets, const int evict_bound,
|
||||
const int pos_width, int *const rehash_requests) {
|
||||
// Create local cuckoo table in shared memory. Size passed in as the third
|
||||
// kernel parameter.
|
||||
extern __shared__ uint64_t local_key[];
|
||||
for (int i = 0; i < num_funcs; ++i) {
|
||||
local_key[i * BUCKET_SIZE + threadIdx.x] = EMPTY_CELL;
|
||||
}
|
||||
|
||||
// might be useful
|
||||
__syncthreads();
|
||||
|
||||
// Get thread index.
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
uint64_t cur_idx = idx;
|
||||
|
||||
// Only threads within local bucket range are active.
|
||||
if (threadIdx.x < counters[blockIdx.x]) {
|
||||
// Set initial conditions.
|
||||
uint64_t cur_key = key_buf[cur_idx];
|
||||
int cur_func = 0;
|
||||
int evict_count = 0;
|
||||
|
||||
// Start the test-kick-and-reinsert loops.
|
||||
do {
|
||||
int pos = do_2nd_hash(cur_key, hash_func_configs, cur_func, BUCKET_SIZE);
|
||||
|
||||
uint64_t new_data = make_data(cur_idx + 1, cur_func, pos_width);
|
||||
|
||||
uint64_t old_idx =
|
||||
atomicExch(&local_key[cur_func * BUCKET_SIZE + pos], new_data);
|
||||
|
||||
if (old_idx != EMPTY_CELL) {
|
||||
cur_idx = fetch_val(old_idx, pos_width) - 1;
|
||||
// potential overflow here. It seems that cur_idx < 0 is possible!
|
||||
cur_key = key_buf[cur_idx];
|
||||
cur_func = (fetch_func(old_idx, pos_width) + 1) % num_funcs;
|
||||
evict_count++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
} while (evict_count < num_funcs * evict_bound);
|
||||
|
||||
// If exceeds eviction bound, then needs rehashing.
|
||||
if (evict_count >= num_funcs * evict_bound) {
|
||||
atomicAdd(rehash_requests, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Every thread write its responsible local slot into the global data table.
|
||||
__syncthreads();
|
||||
for (int i = 0; i < num_funcs; ++i) {
|
||||
uint64_t cur_idx = local_key[i * BUCKET_SIZE + threadIdx.x];
|
||||
if (cur_idx == EMPTY_CELL) {
|
||||
continue;
|
||||
}
|
||||
int cur_func = fetch_func(cur_idx, pos_width);
|
||||
cur_idx = fetch_val(cur_idx, pos_width) - 1;
|
||||
key[i * size + idx] = key_buf[cur_idx];
|
||||
val[i * size + idx] = val_buf[cur_idx];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cuckooLookupKernel_Multi(
|
||||
const uint64_t *const keys, uint64_t *const results, const int n,
|
||||
const uint64_t *const all_keys, const uint64_t *const all_vals,
|
||||
const int size, const FuncConfig *const hash_func_configs,
|
||||
const int num_funcs, const int num_buckets, const int pos_width) {
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
// Only threads within range are active.
|
||||
if (idx < n) {
|
||||
uint64_t key = keys[idx];
|
||||
int bucket_num = do_1st_hash(key, num_buckets);
|
||||
for (int i = 0; i < num_funcs; ++i) {
|
||||
int pos = bucket_num * BUCKET_SIZE +
|
||||
do_2nd_hash(key, hash_func_configs, i, BUCKET_SIZE);
|
||||
if (all_keys[i * size + pos] == key) {
|
||||
results[idx] = all_vals[i * size + pos] + 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(Haotian): should be a value that will not be encountered.
|
||||
results[idx] = EMPTY_CELL;
|
||||
}
|
||||
}
|
||||
|
||||
void CuckooHashTableCuda_Multi::lookup_vals(const uint64_t *const keys,
|
||||
uint64_t *d_key, uint64_t *d_val,
|
||||
uint64_t *const results,
|
||||
const int n) {
|
||||
// Launch the lookup kernel.
|
||||
cuckooLookupKernel_Multi<<<ceil((double)n / BUCKET_SIZE), BUCKET_SIZE>>>(
|
||||
keys, results, n, d_key, d_val, _size, _d_hash_func_configs, _num_funcs,
|
||||
_num_buckets, _pos_width);
|
||||
}
|
||||
|
||||
int CuckooHashTableCuda_Multi::insert_vals(const uint64_t *const keys,
|
||||
const uint64_t *const vals,
|
||||
uint64_t *d_key_buf,
|
||||
uint64_t *d_val_buf, uint64_t *d_key,
|
||||
uint64_t *d_val, const int n) {
|
||||
//
|
||||
// Phase 1: Distribute keys into buckets.
|
||||
//
|
||||
|
||||
// Allocate GPU memory.
|
||||
|
||||
int *d_counters = NULL;
|
||||
|
||||
cudaMalloc((void **)&d_counters, _num_buckets * sizeof(int));
|
||||
|
||||
cudaMemset(d_counters, 0, _num_buckets * sizeof(int));
|
||||
|
||||
// Invoke bucket kernel.
|
||||
cuckooBucketKernel_Multi<<<ceil((double)n / BUCKET_SIZE), BUCKET_SIZE>>>(
|
||||
d_key_buf, d_val_buf, _size, keys, vals, n, d_counters, _num_buckets);
|
||||
|
||||
//
|
||||
// Phase 2: Local cuckoo hashing.
|
||||
//
|
||||
|
||||
// Allocate GPU memory.
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
int *d_rehash_requests = NULL;
|
||||
|
||||
cudaMalloc((void **)&d_rehash_requests, sizeof(int));
|
||||
|
||||
// Copy values onto GPU memory.
|
||||
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
|
||||
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
|
||||
|
||||
// Invoke insert kernel. Passes shared memory table size by the third
|
||||
// argument. Loops until no rehashing needed.
|
||||
|
||||
int rehash_count = 0;
|
||||
do {
|
||||
int rehash_requests = 0;
|
||||
cudaMemset(d_rehash_requests, 0, sizeof(int));
|
||||
cuckooInsertKernel_Multi<<<ceil((double)_size / BUCKET_SIZE), BUCKET_SIZE,
|
||||
_num_funcs * BUCKET_SIZE * sizeof(uint64_t)>>>(
|
||||
d_key, d_val, d_key_buf, d_val_buf, _size, _d_hash_func_configs,
|
||||
_num_funcs, d_counters, _num_buckets, _evict_bound, _pos_width,
|
||||
d_rehash_requests);
|
||||
cudaMemcpy(&rehash_requests, d_rehash_requests, sizeof(int),
|
||||
cudaMemcpyDeviceToHost);
|
||||
|
||||
if (rehash_requests == 0) {
|
||||
break;
|
||||
} else {
|
||||
rehash_count++;
|
||||
gen_hash_funcs();
|
||||
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
|
||||
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
|
||||
}
|
||||
} while (rehash_count < MAX_DEPTH);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Free GPU resources.
|
||||
|
||||
if (d_counters != NULL) {
|
||||
cudaFree(d_counters);
|
||||
}
|
||||
if (d_rehash_requests != NULL) {
|
||||
cudaFree(d_rehash_requests);
|
||||
}
|
||||
|
||||
return (rehash_count < MAX_DEPTH) ? rehash_count : ERR_DEPTH;
|
||||
}
|
||||
|
||||
@alias(hash_query, in0)
|
||||
@alias(hash_target, in1)
|
||||
@alias(idx_target, in2)
|
||||
@alias(key_buf, in3)
|
||||
@alias(val_buf, in4)
|
||||
@alias(key, in5)
|
||||
@alias(val, in6)
|
||||
""",
|
||||
cuda_src="""
|
||||
int n = hash_target_shape0;
|
||||
int n1 = hash_query_shape0;
|
||||
const int nextPow2 = pow(2, ceil(log2((double)n)));
|
||||
// When n is large, the hash values tend to be more evenly distributed and
|
||||
// choosing table_size to be 2 * nextPow2 typically suffices. For smaller n,
|
||||
// the effect of uneven distribution of hash values is more pronounced and
|
||||
// hence we choose table_size to be 4 * nextPow2 to reduce the chance of
|
||||
// bucket overflow.
|
||||
int table_size = (n < 2048) ? 4 * nextPow2 : 2 * nextPow2;
|
||||
if (table_size < 512) {
|
||||
table_size = 512;
|
||||
}
|
||||
int num_funcs = 3;
|
||||
|
||||
CuckooHashTableCuda_Multi in_hash_table(table_size, 8 * ceil(log2((double)n)),
|
||||
num_funcs);
|
||||
in_hash_table.insert_vals((uint64_t *)(hash_target_p),
|
||||
(uint64_t *)(idx_target_p),
|
||||
(uint64_t *)(key_buf_p),
|
||||
(uint64_t *)(val_buf_p),
|
||||
(uint64_t *)(key_p),
|
||||
(uint64_t *)(val_p), n);
|
||||
in_hash_table.lookup_vals((uint64_t *)(hash_query_p),
|
||||
(uint64_t *)(key_p),
|
||||
(uint64_t *)(val_p),
|
||||
(uint64_t *)(out_p), n1);
|
||||
|
||||
"""
|
||||
)
|
||||
else:
|
||||
output = jt.code(queries.shape, jt.int64, [queries, references, indices],
|
||||
cpu_header="""
|
||||
#include <cmath>
|
||||
#include <google/dense_hash_map>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
@alias(hash_query, in0)
|
||||
@alias(hash_target, in1)
|
||||
@alias(idx_target, in2)
|
||||
""",
|
||||
cpu_src="""
|
||||
int n = hash_target_shape0;
|
||||
int n1 = hash_query_shape0;
|
||||
|
||||
google::dense_hash_map<int64_t, int64_t> hashmap; // Google Sparse Hash library
|
||||
hashmap.set_empty_key(0);
|
||||
|
||||
for (int idx = 0; idx < n; ++ idx ) {
|
||||
int64_t key = @hash_target(idx);
|
||||
int64_t val = @idx_target(idx) + 1;
|
||||
hashmap.insert(std::make_pair(key, val));
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int idx = 0; idx < n1; ++ idx ) {
|
||||
int64_t key = @hash_query(idx);
|
||||
auto iter = hashmap.find(key);
|
||||
if (iter != hashmap.end()) {
|
||||
@out(idx) = iter->second;
|
||||
} else @out(idx) = 0;
|
||||
}
|
||||
"""
|
||||
)
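# Indices are stored in the hash tables as (index + 1) so that 0 can serve
# as the "empty / not found" sentinel on both the CUDA and CPU paths; the
# subtraction below restores the original indices and turns misses into -1.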
|
||||
output = (output - 1).view(*q_size)
|
||||
return output
|
||||
# return indices
|
|
@ -1,114 +0,0 @@
|
|||
import jittor as jt
|
||||
from jittor import Function
|
||||
|
||||
from JSparse import SparseTensor
|
||||
|
||||
__all__ = ['spvoxelize']
|
||||
|
||||
class Voxelize(Function):
|
||||
def execute(
|
||||
self,
|
||||
values: jt.Var,
|
||||
idx_query: jt.Var,
|
||||
counts: jt.Var
|
||||
) -> jt.Var:
|
||||
# N = values_shape0
|
||||
# c = values_shape1
|
||||
# N1 = counts_shape0
|
||||
# out: N1 x c
|
||||
output = jt.code((counts.shape[0], values.shape[1]), "float32", [values, idx_query, counts],
|
||||
cuda_header="""
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cuda_runtime.h>
|
||||
""",
|
||||
cuda_src="""
|
||||
__global__ void voxelize_forward_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
@alias(values, in0)
|
||||
@alias(idx_query, in1)
|
||||
@alias(counts, in2)
|
||||
|
||||
int index = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int c = values_shape1;
|
||||
int i = index / c;
|
||||
int j = index % c;
|
||||
|
||||
if (i < values_shape0) {
|
||||
int pos = @idx_query(i);
|
||||
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) return;
|
||||
atomicAdd(&@out(pos, j), @values(i, j) / (float)(@counts(pos)));
|
||||
}
|
||||
}
|
||||
@alias(values, in0)
|
||||
voxelize_forward_kernel<<< values_shape0, values_shape1 >>>(@ARGS);
|
||||
""",
|
||||
cpu_src="""
|
||||
@alias(values, in0)
|
||||
@alias(idx_query, in1)
|
||||
@alias(counts, in2)
|
||||
for (int i = 0; i < values_shape0; ++ i ) {
|
||||
int pos = @idx_query(i);
|
||||
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0)
|
||||
continue;
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < values_shape1; ++ j ) {
|
||||
#pragma omp atomic
|
||||
@out(pos, j) += @values(i, j) / (float)@counts(pos);
|
||||
}
|
||||
}
|
||||
"""
|
||||
)
|
||||
self.save_vars = idx_query, counts, values.shape[0]
|
||||
return output
|
||||
|
||||
def grad(self, grad_output: jt.Var):
|
||||
idx_query, counts, input_size = self.save_vars
|
||||
|
||||
grad_values = jt.code((input_size, grad_output.shape[1]), jt.float32, [idx_query, counts, grad_output],
|
||||
cuda_header="""
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cuda_runtime.h>
|
||||
""",
|
||||
cuda_src="""
|
||||
__global__ void voxelize_backward_kernel(@ARGS_DEF) {
|
||||
@PRECALC
|
||||
@alias(idx_query, in0)
|
||||
@alias(counts, in1)
|
||||
@alias(grad_output, in2)
|
||||
int index = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int i = index / grad_output_shape1;
|
||||
int j = index % grad_output_shape1;
|
||||
if (i < out_shape0) {
|
||||
int pos = @idx_query(i);
|
||||
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) return;
|
||||
atomicAdd(&@out(i, j), @grad_output(pos, j) / (float)@counts(pos));
|
||||
}
|
||||
}
|
||||
|
||||
voxelize_backward_kernel<<<out_shape0, out_shape1>>>(@ARGS);
|
||||
""",
|
||||
cpu_src="""
|
||||
@alias(idx_query, in0)
|
||||
@alias(counts, in1)
|
||||
@alias(grad_output, in2)
|
||||
|
||||
for (int i = 0; i < out_shape0; ++ i ) {
|
||||
int pos = @idx_query(i);
|
||||
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) continue;
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < grad_output_shape1; ++ j ) {
|
||||
@out(i, j) = @grad_output(pos, j) / (float)@counts(pos);
|
||||
}
|
||||
}
|
||||
"""
|
||||
)
|
||||
return grad_values, None, None
|
||||
|
||||
def spvoxelize(
|
||||
values: jt.Var,
|
||||
idx_query: jt.Var,
|
||||
counts: jt.Var
|
||||
) -> jt.Var:
|
||||
return Voxelize.apply(values, idx_query, counts)
|
|
@ -1,6 +0,0 @@
|
|||
from .activation import *
|
||||
# from .bev import *
|
||||
from .conv import *
|
||||
# from .crop import *
|
||||
from .norm import *
|
||||
from .pooling import *
|
|
@ -1,21 +0,0 @@
|
|||
import jittor as jt
|
||||
from jittor import nn
|
||||
|
||||
from JSparse import SparseTensor
|
||||
from JSparse.nn.functional import relu, leaky_relu
|
||||
# from nn.utils import fapply
|
||||
|
||||
__all__ = ['ReLU', 'LeakyReLU']
|
||||
|
||||
# class ReLU(nn.ReLU):
|
||||
# def execute(self, input: SparseTensor) -> SparseTensor:
|
||||
# return fapply(input, super().execute)
|
||||
|
||||
# class LeakyReLU(nn.LeakyReLU):
|
||||
# def execute(self, input: SparseTensor) -> SparseTensor:
|
||||
# return fapply(input, super().execute)
|
||||
|
||||
Relu = jt.make_module(relu)
|
||||
ReLU = Relu
|
||||
Leaky_relu = jt.make_module(leaky_relu, 2)
|
||||
LeakyReLU = Leaky_relu
|
|
@ -1,85 +0,0 @@
|
|||
import math
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import jittor as jt
|
||||
from jittor import nn
|
||||
from jittor import init
|
||||
from jittor.misc import _pair, _triple
|
||||
|
||||
from JSparse import SparseTensor
|
||||
from JSparse.nn import functional as F
|
||||
# from utils import make_ntuple
|
||||
|
||||
__all__ = ['Conv3d']
|
||||
|
||||
class Conv3d(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int, ...]] = 3,
|
||||
stride: Union[int, Tuple[int, ...]] = 1,
|
||||
dilation: int = 1,
|
||||
groups: int = 1,
|
||||
bias: bool = False,
|
||||
transposed: bool = False) -> None:
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
# self.kernel_size = make_ntuple(kernel_size, ndim=3)
|
||||
# self.stride = make_ntuple(stride, ndim=3)
|
||||
# self.dilation = dilation
|
||||
self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
|
||||
self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
|
||||
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation, dilation)
|
||||
self.groups = groups
|
||||
assert in_channels % groups == 0, 'in_channels must be divisible by groups'
|
||||
assert out_channels % groups == 0, 'out_channels must be divisible by groups'
|
||||
self.transposed = transposed
|
||||
|
||||
self.kernel_volume = int(np.prod(self.kernel_size))
|
||||
# if self.kernel_volume > 1:
|
||||
# self.kernel = nn.Parameter(
|
||||
# jt.zeros(self.kernel_volume, in_channels, out_channels))
|
||||
# else:
|
||||
# self.kernel = nn.Parameter(jt.zeros(in_channels, out_channels))
|
||||
# if bias:
|
||||
# self.bias = nn.Parameter(jt.zeros(out_channels))
|
||||
# else:
|
||||
# self.register_parameter('bias', None)
|
||||
# self.reset_parameters()
|
||||
fan = (self.out_channels if self.transposed else self.in_channels) * self.kernel_volume
|
||||
std = 1 / math.sqrt(fan)
|
||||
|
||||
if self.kernel_volume > 1:
|
||||
self.weight = init.uniform([self.kernel_volume, in_channels, out_channels], 'float32', -std, std)
|
||||
else:
|
||||
self.weight = init.uniform([in_channels, out_channels], 'float32', -std, std)
|
||||
if bias:
|
||||
self.bias = init.uniform([out_channels], "float32", -std, std)
|
||||
else:
|
||||
self.bias = None
|
||||
# self.reset_parameters()
|
||||
|
||||
def execute(self, input: SparseTensor) -> SparseTensor:
|
||||
return F.conv3d(input,
|
||||
weight=self.weight,
|
||||
kernel_size=self.kernel_size,
|
||||
bias=self.bias,
|
||||
stride=self.stride,
|
||||
dilation=self.dilation,
|
||||
groups=self.groups,
|
||||
transposed=self.transposed)
|
||||
|
||||
# def set_parameters(self) -> None:
|
||||
# std = 1 / math.sqrt(
|
||||
# (self.out_channels if self.transposed else self.in_channels)
|
||||
# * self.kernel_volume)
|
||||
# self.weight *= std
|
||||
# if self.bias is not None:
|
||||
# self.bias *= std
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
import jittor as jt
|
||||
from jittor import nn
|
||||
|
||||
|
||||
from JSparse import SparseTensor
|
||||
from JSparse.nn.utils import fapply
|
||||
|
||||
__all__ = ['BatchNorm', 'GroupNorm']
|
||||
|
||||
class BatchNorm(nn.BatchNorm):
|
||||
def execute(self, input: SparseTensor) -> SparseTensor:
|
||||
return fapply(input, super().execute)
|
||||
|
||||
class GroupNorm(nn.GroupNorm):
|
||||
def execute(self, input: SparseTensor) -> SparseTensor:
|
||||
indices, values, stride, size = input.indices, input.values, input.stride, input.size
|
||||
|
||||
batch_size = jt.max(indices[:, 0]).item() + 1
|
||||
num_channels = values.shape[1]
|
||||
|
||||
n_values = jt.zeros_like(values)
|
||||
for k in range(batch_size):
|
||||
idx = indices[:, 0] == k
|
||||
b_values = values[idx]
|
||||
b_values = b_values.t().reshape(1, num_channels, -1)
|
||||
b_values = super().execute(b_values)
|
||||
b_values = b_values.reshape(num_channels, -1).t()
|
||||
n_values[idx] = b_values
|
||||
|
||||
output = SparseTensor(indices=indices, values=n_values, stride=stride, size=size)
|
||||
output.cmaps = input.cmaps
|
||||
output.kmaps = input.kmaps
|
||||
return output
|
||||
|
|
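The per-sample loop above hinges on a `(N, C) -> (1, C, N) -> (N, C)` reshape round-trip so that `nn.GroupNorm` sees the layout it expects. A small standalone check of that layout; the sizes are illustrative.

```
import jittor as jt

values = jt.randn(5, 8)                  # 5 points, 8 channels
b = values.t().reshape(1, 8, -1)         # layout expected by nn.GroupNorm
restored = b.reshape(8, -1).t()
assert restored.shape == values.shape
```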
@@ -1,11 +0,0 @@
import jittor as jt
from jittor import nn

from JSparse import SparseTensor
from JSparse.nn.functional import global_avg_pool, global_max_pool

__all__ = ['GlobalAvgPool', 'GlobalMaxPool']

GlobalAvgPool = jt.make_module(global_avg_pool)
GlobalMaxPool = jt.make_module(global_max_pool)
@@ -1,2 +0,0 @@
from .apply import *
from .kernel import *
@@ -1,15 +0,0 @@
from typing import Callable

import jittor as jt

from JSparse import SparseTensor

__all__ = ['fapply']


def fapply(input: SparseTensor, fn: Callable[..., jt.Var], *args,
           **kwargs) -> SparseTensor:
    values = fn(input.values, *args, **kwargs)
    output = SparseTensor(indices=input.indices, values=values, stride=input.stride, size=input.size)
    output.cmaps = input.cmaps
    output.kmaps = input.kmaps
    return output
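A minimal sketch of the intended use: wrapping an element-wise jittor op so that it runs on a SparseTensor's values while the coordinates, stride and size pass through untouched. The `sparse_relu` helper is illustrative, not part of the library.

```
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.utils import fapply

def sparse_relu(x: SparseTensor) -> SparseTensor:
    # nn.relu only ever sees x.values; indices/stride/size are preserved.
    return fapply(x, nn.relu)
```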
@@ -1,28 +0,0 @@
from typing import Tuple, Union

import numpy as np
import jittor as jt

from JSparse.utils import make_ntuple, trunc

__all__ = ['get_kernel_offsets']


def get_kernel_offsets(kernel_size: Union[int, Tuple[int, ...]],
                       stride: Union[int, Tuple[int, ...]] = 1,
                       dilation: Union[int, Tuple[int, ...]] = 1) -> jt.Var:
    kernel_size = make_ntuple(kernel_size, ndim=3)
    stride = make_ntuple(stride, ndim=3)
    dilation = make_ntuple(dilation, ndim=3)

    offsets = [(np.arange(-kernel_size[k] // 2 + 1, kernel_size[k] // 2 + 1) * stride[k]
                * dilation[k]) for k in range(3)]

    # For odd kernels, x varies fastest so that the (0, 0, 0) offset lands in
    # the middle of the list; for even kernels, z varies fastest.
    if np.prod(kernel_size) % 2 == 1:
        offsets = [[x, y, z] for z in offsets[2] for y in offsets[1]
                   for x in offsets[0]]
    else:
        offsets = [[x, y, z] for x in offsets[0] for y in offsets[1]
                   for z in offsets[2]]

    offsets = jt.int32(offsets)
    return offsets
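As a quick sanity check of the ordering logic (the numbers below follow directly from the code above):

```
from JSparse.nn.utils import get_kernel_offsets

offsets = get_kernel_offsets(kernel_size=2, stride=1, dilation=1)
print(offsets.shape)   # (8, 3): one xyz offset per cell of a 2x2x2 kernel

# For kernel_size=3 there are 27 rows and the (0, 0, 0) offset sits at
# row 13, the middle of the list, thanks to the odd-kernel ordering.
```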
@@ -1,125 +0,0 @@
import numpy as np

import jittor as jt
from jittor.misc import _pair, _triple

from typing import Any, Dict, Tuple, Union
from type_check import type_check

from JSparse.utils import make_ntuple, sparse_quantize, set_hash


class SparseTensor:

    @type_check
    def __init__(
        self,
        indices: jt.Var,
        values: jt.Var,
        stride: Union[int, Tuple[int, ...]],
        size=None,
        quantize=True,
        voxel_size=1,
        coalesce_mode: str = 'sum',
        indice_manager=None,
        device=None,
    ):
        assert values.ndim == 2
        self.size = size
        self.ndim = indices.shape[1] - 1
        self.stride = _triple(stride)
        self.voxel_size = voxel_size
        self.coalesce_mode = coalesce_mode
        self.cmaps = {}
        self.kmaps = {}

        # TODO: hook up an IndiceManager so that coordinate maps can be shared
        # across tensors instead of being rebuilt for every tensor.

        if quantize:
            # Derive a deterministic seed from the stride so that tensors with
            # the same stride use the same hash multipliers.
            self.seed = 1
            for i in range(len(self.stride)):
                self.seed += i
                self.seed *= self.stride[i]
            self.hash_multiplier = set_hash(self.ndim, self.seed)

            self.hash_num, self.indices, mapping, inverse_mapping, count = sparse_quantize(
                indices, self.hash_multiplier, self.voxel_size,
                return_index=True, return_inverse=True, return_count=True)
            self.inverse_mapping = inverse_mapping

            if len(values.shape) == 1:
                out_size = (self.indices.shape[0],)
            elif len(values.shape) == 2:
                out_size = (self.indices.shape[0], values.shape[-1])

            # Coalesce the values of duplicate coordinates.
            if self.coalesce_mode == 'sum':
                self.values = jt.zeros(out_size, dtype=values.dtype).scatter_(
                    0, inverse_mapping, values, reduce='add')
            elif self.coalesce_mode == 'average':
                self.values = jt.zeros(out_size, dtype=values.dtype).scatter_(
                    0, inverse_mapping, values, reduce='add')
                self.values /= count.unsqueeze(1)  # broadcast counts over channels
            elif self.coalesce_mode == 'sample':
                self.values = values[mapping]  # keep one representative per voxel
        else:
            self.indices = indices
            self.values = values

    def _indices(self):
        return self.indices

    def _values(self):
        return self.values

    def _binary_operation(self, other, _binary_op):
        assert isinstance(other, self.__class__)
        # TODO: keep a dict of already-merged coordinate groups so that the
        # same indices are not merged twice.
        return


class PointTensor:

    def __init__(self, values, indices, idx_query=None, weights=None):
        self.values = values
        self.indices = indices
        self.idx_query = idx_query if idx_query is not None else {}
        self.weights = weights if weights is not None else {}
        self.additional_values = {}
        self.additional_values['idx_query'] = {}
        self.additional_values['counts'] = {}

    def detach(self):
        self.values = self.values.detach()
        self.indices = self.indices.detach()
        return self

    def __add__(self, other):
        pt = PointTensor(self.values + other.values, self.indices, self.idx_query,
                         self.weights)
        pt.additional_values = self.additional_values
        return pt
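A small sketch of the coalescing behaviour, assuming the constructor is used as above: two input points that quantize to the same voxel are merged according to `coalesce_mode`. The concrete coordinates and values are illustrative.

```
import jittor as jt
from JSparse import SparseTensor

indices = jt.int32([[0, 0, 0, 0],
                    [0, 0, 0, 0],      # duplicate of the first coordinate
                    [0, 1, 2, 3]])
values = jt.Var([[1.0], [3.0], [5.0]])

x = SparseTensor(indices, values, 1, voxel_size=1, coalesce_mode='average')
# x.indices keeps 2 unique coordinates; the duplicated voxel's value
# becomes (1.0 + 3.0) / 2 = 2.0, while the other keeps 5.0.
```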
@@ -1,42 +0,0 @@
import jittor as jt
from jittor import Function


def spmm(
    rows: jt.Var,
    cols: jt.Var,
    vals: jt.Var,
    size: jt.NanoVector,
    mat: jt.Var,
    spmm_mode='scatter',
    is_sorted: bool = False,
    cuda_spmm_alg: int = 1,
) -> jt.Var:
    assert len(rows) == len(cols), "Invalid length"
    assert len(rows) == len(vals), "Invalid length"
    assert vals.dtype == mat.dtype, "dtype mismatch"

    if jt.flags.use_cuda > 1:
        assert jt.has_cuda == 1, "No GPUs available"
        rows = rows.int()
        cols = cols.int()
        # TODO: use the coo_spmm of cuSPARSE on GPU, e.g.
        # result = coo_spmm_int32(
        #     rows, cols, vals, size[0], size[1], mat, cuda_spmm_alg, is_sorted)
    else:
        if spmm_mode == 'scatter':
            # TODO: scatter-based fallback, left unimplemented in this file.
            pass


class SPMM(Function):
    # TODO: autograd wrapper around spmm (forward/backward not implemented yet).
    pass
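The scatter branch is left as a TODO above. One possible CPU fallback reuses the `scatter_(0, index, src, reduce='add')` pattern that the coalescing code in sparse.py already relies on; `spmm_scatter` is an illustrative name, not an existing API, and it assumes a 1-D index broadcasts over the feature dimension exactly as in that coalescing code.

```
import jittor as jt

def spmm_scatter(rows: jt.Var, cols: jt.Var, vals: jt.Var,
                 size, mat: jt.Var) -> jt.Var:
    # out[r] += vals[k] * mat[c] for every nonzero (r, c) = (rows[k], cols[k]).
    contrib = vals.unsqueeze(1) * mat[cols.int()]        # (nnz, F)
    out = jt.zeros((size[0], mat.shape[1]), dtype=mat.dtype)
    return out.scatter_(0, rows.int(), contrib, reduce='add')
```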
@@ -1,2 +0,0 @@
from .quantize import *
from .utils import *
@@ -1,58 +0,0 @@
from itertools import repeat
from typing import List, Tuple, Union

import jittor as jt
import numpy as np

from .utils import unique1d

__all__ = ['sparse_quantize', 'set_hash']


def set_hash(ndim, seed, low=100, high=1000):
    jt.set_seed(seed)
    return jt.randint(low, high, shape=(ndim + 1,), dtype='uint64')


def hash(x: jt.Var, multiplier: jt.Var) -> jt.Var:
    assert x.ndim == 2, x.shape

    x = x - x.min(dim=0)
    x = x.uint64()

    h = jt.zeros(x.shape[0], dtype='uint64')
    for k in range(x.shape[1] - 1):
        h += x[:, k]
        h *= multiplier[k]
    h += x[:, -1]
    return h


def sparse_quantize(indices,
                    hash_multiplier,
                    voxel_size: Union[float, Tuple[float, ...]] = 1,
                    *,
                    return_index: bool = False,
                    return_inverse: bool = False,
                    return_count: bool = False) -> List[np.ndarray]:
    # Integer coordinates with a unit voxel size are already quantized.
    if indices.dtype.is_int() and voxel_size == 1:
        pass
    else:
        if isinstance(voxel_size, (float, int)):
            voxel_size = tuple(repeat(voxel_size, 3))
        assert isinstance(voxel_size, tuple) and len(voxel_size) == 3

        voxel_size = jt.Var(voxel_size)
        indices[:, 1:] /= voxel_size
        indices = jt.floor(indices).astype(jt.int32)

    hash_num, mapping, inverse_mapping, count = unique1d(hash(indices, hash_multiplier))
    indices = indices[mapping]

    outputs = [hash_num, indices]
    if return_index:
        outputs += [mapping]
    if return_inverse:
        outputs += [inverse_mapping]
    if return_count:
        outputs += [count]
    return outputs[0] if len(outputs) == 1 else outputs
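A short sketch of how the two helpers combine (the coordinates are illustrative): `set_hash` fixes the random multipliers, and `sparse_quantize` then deduplicates the hashed coordinates and reports the mappings.

```
import jittor as jt
from JSparse.utils import set_hash, sparse_quantize

coords = jt.int32([[0, 0, 0, 0],
                   [0, 0, 0, 0],
                   [0, 1, 1, 1]])          # batch index + xyz
multiplier = set_hash(ndim=3, seed=1)

hash_num, uniq, mapping, inverse, count = sparse_quantize(
    coords, multiplier, voxel_size=1,
    return_index=True, return_inverse=True, return_count=True)
# uniq holds the 2 distinct coordinates, inverse maps every input row to its
# voxel, and count records how many inputs landed in each voxel.
```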
@@ -1,43 +0,0 @@
from itertools import repeat
from typing import List, Tuple, Union

import jittor as jt

__all__ = ['make_ntuple', 'trunc', 'unique1d']


def make_ntuple(x: Union[int, List[int], Tuple[int, ...], jt.Var],
                ndim: int) -> Tuple[int, ...]:
    if isinstance(x, int):
        x = tuple(repeat(x, ndim))
    elif isinstance(x, list):
        x = tuple(x)
    elif isinstance(x, jt.Var):
        x = tuple(x.view(-1).numpy().tolist())

    assert isinstance(x, tuple) and len(x) == ndim, x
    return x


def trunc(x: jt.Var):
    # Round towards zero.
    if x >= 0:
        return jt.floor(x)
    else:
        return jt.ceil(x)


def unique1d(var):
    # Returns (unique values, an index for each unique value, inverse mapping,
    # counts), mirroring np.unique with return_index/return_inverse/return_counts.
    assert len(var.shape) == 1
    perm, aux = jt.argsort(var)
    mask = jt.empty(aux.shape, dtype='bool')
    mask[:1] = True
    mask[1:] = aux[1:] != aux[:-1]
    ret = (aux[mask],)
    ret += (perm[mask],)
    imask = jt.cumsum(mask.astype(perm.dtype)) - 1
    inv_idx = jt.empty(mask.shape, dtype=perm.dtype)
    inv_idx[perm] = imask
    ret += (inv_idx,)
    idx = jt.concat([jt.nonzero(mask).view(-1), jt.Var(mask.shape[0])])
    ret += (idx[1:] - idx[:-1],)
    return ret
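The quadruple returned by `unique1d` mirrors `np.unique(..., return_index=True, return_inverse=True, return_counts=True)`; a small worked example, with the expected outputs derived by hand from the code above:

```
import jittor as jt
from JSparse.utils import unique1d

v = jt.int32([3, 1, 3, 2, 1])
uniq, index, inverse, count = unique1d(v)
# uniq    -> [1, 2, 3]
# inverse -> index of each element of v inside uniq: [2, 0, 2, 1, 0]
# count   -> [2, 1, 2]
```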
@@ -1,15 +0,0 @@
from setuptools import find_packages, setup

with open("../README.md", "r") as file:
    description = file.read()

print(find_packages())
setup(
    name='JSparse',
    version='0.1',
    description=description,
    packages=find_packages(),
    install_requires=["jittor", "type_check"],
)
@@ -1,96 +0,0 @@
import time
import math
import numpy as np
import torch

import jittor as jt
import jittor.nn as nn
from jittor import init
from jittor.misc import _pair, _triple

from itertools import repeat
from typing import List, Tuple, Union

from JSparse import SparseTensor
from JSparse import PointTensor
from JSparse.utils import make_ntuple
from JSparse.nn import functional as F
from JSparse.nn.utils import get_kernel_offsets
from JSparse.nn.functional import Convolution

import torchsparse
from torchsparse import nn as spnn

jt.flags.use_cuda = 1

in_channels = 3
out_channels = 64
kernel_size = 3
stride = 1
dilation = 1
groups = 1
bias = False
transposed = False

kernel_size = _triple(kernel_size)
stride = _triple(stride)
dilation = _triple(dilation)
kernel_volume = int(np.prod(kernel_size))

N = 10
coords = np.random.uniform(0, 10, size=(N, 4))
feats = np.random.randn(N, 3)
labels = np.random.choice(5, N)
print(coords.shape)
print(feats.shape)

coo = jt.Var(coords)
val = jt.Var(feats)
size = (10, 10, 10)

fan = (out_channels if transposed else in_channels) * kernel_volume
std = 1 / math.sqrt(fan)

if kernel_volume > 1:
    weight = init.uniform([kernel_volume, in_channels, out_channels], 'float32', -std, std)
else:
    weight = init.uniform([in_channels, out_channels], 'float32', -std, std)
if bias:
    bias = init.uniform([out_channels], "float32", -std, std)
else:
    bias = None

x = SparseTensor(coo, val, 1, size)
z = PointTensor(x.values, x.indices.float())

# Voxelize the point tensor onto the sparse tensor's coordinate system.
pc_hash = F.sphash(
    jt.concat([
        z.indices[:, 0].int().view(-1, 1),
        jt.floor(z.indices[:, 1:] / x.stride[0]).int() * x.stride[0]
    ], 1))
sparse_hash = F.sphash(x.indices)
idx_query = F.spquery(pc_hash, sparse_hash).int()
counts = F.spcount(idx_query, x.indices.shape[0])
z.additional_values['idx_query'][x.stride] = idx_query
z.additional_values['counts'][x.stride] = counts
inserted_values = F.spvoxelize(z.values, idx_query, counts)
new_tensor = SparseTensor(x.indices, inserted_values, x.stride, x.size, False)
new_tensor.cmaps = x.cmaps
new_tensor.kmaps = x.kmaps
print(inserted_values)

# Devoxelize back to the points with trilinear interpolation weights.
offsets = get_kernel_offsets(kernel_size=2, stride=x.stride, dilation=1)
old_hash = F.sphash(
    jt.concat([
        z.indices[:, 0].int().view(-1, 1),
        jt.floor(z.indices[:, 1:] / x.stride[0]).int() * x.stride[0]
    ], 1), offsets)
pc_hash = F.sphash(x.indices)
idx_query = F.spquery(old_hash, pc_hash).int()
weights = F.calc_ti_weights(z.indices, idx_query,
                            scale=x.stride[0]).t()
idx_query = idx_query.t()
new_values = F.spdevoxelize(x.values, idx_query, weights)

print(jt.grad(new_values, x.values))