Jetson/L4T/TRT Customized Example
This page collects examples for deploying customized models with TensorRT on Jetson, together with answers to some common questions.
TensorRT Python
OpenCV with ONNX model
Below is an example that deploys an ONNX model with TensorRT, using OpenCV images as input.
Verified environment:
- JetPack4.5.1 + Xavier
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
def Inference(engine):
    # Read the test image and convert it from HWC BGR uint8 to a CHW array in [-1, 1]
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    # Copy the input to the device, run inference, and copy the result back
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times " + str(time.time() - start_time))

    output = host_outputs[0].reshape(np.concatenate(([1], engine.get_binding_shape(1))))
    print(np.argmax(output))


def PrepareEngine():
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 30

        # Parse the ONNX model and build the TensorRT engine
        with open('/usr/src/tensorrt/data/resnet50/ResNet50.onnx', 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
        engine = builder.build_cuda_engine(network)

        # create buffer
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            host_mem = cuda.pagelocked_empty(shape=[size], dtype=np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(cuda_mem))
            if engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        return engine


if __name__ == "__main__":
    engine = PrepareEngine()
    Inference(engine)
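Note: builder.max_workspace_size and build_cuda_engine() are no longer available in the TensorRT 8.x releases shipped with JetPack 5. The sketch below shows the equivalent engine-building step with the newer builder-config API; it assumes TensorRT 8.4 or later and is not verified on the JetPack 4.5.1 setup above, so please check the calls against your installed version.

# Hedged sketch: building the engine with the TensorRT 8.x API
# (assumes TensorRT >= 8.4; not verified on the JetPack 4.5.1 setup above).
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine_trt8(onnx_path):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # The workspace limit moved from the builder to the builder config
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)

    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            return None

    # build_serialized_network() replaces build_cuda_engine()
    plan = builder.build_serialized_network(network, config)
    return trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(plan)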
OpenCV with PLAN model
Below is an example that deploys a serialized TensorRT engine (PLAN file), using OpenCV images as input.
Verified environment:
- JetPack4.5.1 + Xavier
Serialize the ONNX model into a TensorRT PLAN file with trtexec first:
$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/resnet50/ResNet50.onnx --saveEngine=trt.plan
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
def Inference(engine):
    # Read the test image and convert it from HWC BGR uint8 to a CHW array in [-1, 1]
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    # Copy the input to the device, run inference, and copy the result back
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times " + str(time.time() - start_time))

    output = host_outputs[0].reshape(np.concatenate(([1], engine.get_binding_shape(1))))
    print(np.argmax(output))


def PrepareEngine():
    # Deserialize the engine from the PLAN file generated by trtexec
    runtime = trt.Runtime(TRT_LOGGER)
    with open('./trt.plan', 'rb') as f:
        buf = f.read()
        engine = runtime.deserialize_cuda_engine(buf)

    # create buffer
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(shape=[size], dtype=np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    return engine


if __name__ == "__main__":
    engine = PrepareEngine()
    Inference(engine)
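As an alternative to the trtexec command above, an engine built with the Python API (as in the previous section) can be written to a PLAN file directly. A minimal sketch, assuming engine is the object returned by PrepareEngine() in the ONNX example:

# Hedged sketch: saving an already-built engine to a PLAN file instead of
# generating it with trtexec.
with open('trt.plan', 'wb') as f:
    f.write(engine.serialize())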
Multi-threading
Below is an example that runs TensorRT inference from a worker thread.
Verified environment:
- JetPack4.5.1 + Xavier
Prepare the engine, the MNIST test images, and the TensorRT wrapper first:
$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/mnist/mnist.onnx --saveEngine=mnist.trt
$ cd /usr/src/tensorrt/data/mnist/
$ sudo pip3 install pillow
$ python3 download_pgms.py
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/my_tensorrt_code.py -O my_tensorrt_code.py
import threading
import time
from my_tensorrt_code import TRTInference, trt
exitFlag = 0
class myThread(threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        print("Starting " + self.args[0])
        self.func(*self.args)
        print("Exiting " + self.args[0])


if __name__ == '__main__':
    # Create new threads
    '''
    Thread arguments:
        - func: the function to run in the thread
        - args: the arguments passed to func
    '''
    trt_engine_path = 'mnist.trt'
    max_batch_size = 1
    trt_inference_wrapper = TRTInference(trt_engine_path,
                                         trt_engine_datatype=trt.DataType.FLOAT,
                                         batch_size=max_batch_size)

    # Get the TensorRT MNIST model output
    input_img_path = '/usr/src/tensorrt/data/mnist/3.pgm'
    thread1 = myThread(trt_inference_wrapper.infer, [input_img_path])

    # Start the new thread and wait for it to finish
    thread1.start()
    thread1.join()
    trt_inference_wrapper.destory()
    print("Exiting Main Thread")
DeepStream
YoloV4 Tiny
Verified environment:
- JetPack4.5.1 + Xavier
DeepStream can reach 60 fps with four video streams on Xavier:
$ cd /opt/nvidia/deepstream/deepstream-5.1/sources/objectDetector_Yolo
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/yolov4_tiny.patch
$ git apply yolov4_tiny.patch
$ export CUDA_VER=10.2
$ make -C nvdsinfer_custom_impl_Yolo
$ wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg -q --show-progress
$ wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -q --show-progress
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/deepstream_app_config_yoloV4_tiny.txt
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/config_infer_primary_yoloV4_tiny.txt
$ deepstream-app -c deepstream_app_config_yoloV4_tiny.txt
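For reference, the number of streams in a deepstream-app run is controlled by the [sourceX] and [streammux] groups of the application config. The downloaded deepstream_app_config_yoloV4_tiny.txt already contains working settings; the snippet below only sketches the relevant keys, and the uri shown is an assumption rather than the value used in that file.

# Illustrative sketch of the multi-stream keys in a deepstream-app config.
[source0]
enable=1
# type=3 selects a multi-URI file source
type=3
uri=file:///opt/nvidia/deepstream/deepstream-5.1/samples/streams/sample_1080p_h264.mp4
# replicate the source into four streams
num-sources=4

[streammux]
# batch the streams together; should match the total number of sources
batch-size=4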
Custom Parser for SSD-MobileNet Trained by Jetson-inference
Verified environment:
- JetPack4.5.1 + Xavier
$ cd /opt/nvidia/deepstream/deepstream-5.1/sources/objectDetector_SSD/
$ sudo wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/ssd-jetson_inference/ssd-jetson_inference.patch
$ sudo git apply ssd-jetson_inference.patch
$ sudo CUDA_VER=10.2 make -C nvdsinfer_custom_impl_ssd/
Update config_infer_primary_ssd.txt, for example:
diff --git a/config_infer_primary_ssd.txt b/config_infer_primary_ssd.txt
index e5bf468..81c52fd 100644
--- a/config_infer_primary_ssd.txt
+++ b/config_infer_primary_ssd.txt
@@ -62,15 +62,13 @@ gpu-id=0
net-scale-factor=0.0078431372
offsets=127.5;127.5;127.5
model-color-format=0
-model-engine-file=sample_ssd_relu6.uff_b1_gpu0_fp32.engine
-labelfile-path=ssd_coco_labels.txt
-uff-file=sample_ssd_relu6.uff
+model-engine-file=ssd-mobilenet.uff_b1_gpu0_fp16.engine
+uff-file=ssd.uff
infer-dims=3;300;300
uff-input-order=0
uff-input-blob-name=Input
-batch-size=1
-## 0=FP32, 1=INT8, 2=FP16 mode
-network-mode=0
+labelfile-path=labels.txt
+network-mode=2
num-detected-classes=91
interval=0
gie-unique-id=1
$ deepstream-app -c deepstream_app_config_ssd.txt
VPI
VPI with Jetson-utils
Below is an example that uses VPI together with jetson-utils.
Verified environment:
- JetPack4.6 + XavierNX
import numpy as np
import jetson.utils
import vpi
display = jetson.utils.glDisplay()
camera = jetson.utils.gstCamera(1920, 1280, '0')
camera.Open()
while display.IsOpen():
    frame, width, height = camera.CaptureRGBA(zeroCopy=1)
    input = vpi.asimage(np.uint8(jetson.utils.cudaToNumpy(frame)))

    with vpi.Backend.CUDA:
        output = input.convert(vpi.Format.U8)
        output = output.box_filter(11, border=vpi.Border.ZERO).convert(vpi.Format.RGB8)
        vpi.clear_cache()

    display.RenderOnce(jetson.utils.cudaFromNumpy(output.cpu()), width, height)
    display.SetTitle("{:s} | {:d}x{:d} | {:.1f} FPS".format("Camera Viewer", width, height, display.GetFPS()))

camera.Close()
VPI with DeepStream
Please see the following link for the example:
https://forums.developer.nvidia.com/t/deepstream-sdk-vpi-on-jetson-tx2/166834/20