01
02
准备交叉编译环境
2)安装软件;(以 Ubuntu 为例,其他 Linux 发行版类似)
# 1. Install basic software
apt update
apt-get install -y --no-install-recommends \
gcc g++ git make wget python unzip
# 2. Install arm gcc toolchains
# 由于开发板上使用的gcc版本默认为5.4.0版本,因此这里使用5.4.1版本的arm-gcc
wget https://releases.linaro.org/components/toolchain/binaries/5.4-2017.01/arm-linux-gnueabihf/gcc-linaro-5.4.1-2017.01-x86_64_arm-linux-gnueabihf.tar.xz
tar xf gcc-linaro-5.4.1-2017.01-x86_64_arm-linux-gnueabihf.tar.xz
# 将解压好的arm-gcc添加到环境变量
vi /etc/profile
...
export PATH=$PATH:/home/awcloud/Desktop/gcc-linaro-5.4.1-2017.01-x86_64_arm-linux-gnueabihf/bin
...
# 3. Install cmake 3.10 or above
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz &&
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz &&
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 &&
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake &&
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
03
更新PaddleLite库
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
git checkout <release-version-tag>
curl -L https://paddlelite-demo.bj.bcebos.com/devices/intel/intel_fpga_sdk.tar.gz -o - | tar -zx
2)编译并生成PaddleLite+IntelFPGA的部署库:
./lite/tools/build_linux.sh --arch=armv7hf \
--with_extra=ON --with_log=ON --with_intel_fpga=ON \
--intel_fpga_sdk_root=./intel_fpga_sdk full_publish
3)将编译生成的
build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/lib/libpaddle_full_api_shared.so文件上传到FPGA开发板/usr/lib目录下;
4)将编译生成的build.lite.armlinux.armv7hf.gcc/inference_lite_lib.armlinux.armv7hf.intel_fpga/cxx/include目录上传到FPGA开发板/opt/Paddlelite目录下。
5)将intel_fpga_sdk/lib/libvnna.so文件上传到FPGA开发板/usr/lib目录下。
04
利用FPGA开发板预测
1)物料准备:
2)预测推理:
将准备好的物料上传到开发板对应位置后进行测试,首先加载fpga驱动程序:
insmod /opt/intelfpgadrv.ko
下面是示例程序:
ll /opt/plite-yolov3
Makefile
plite-test.cpp
其中Makefile文件内容:
# Build plite-test against the Paddle-Lite C++ prediction library and OpenCV.
TARGET = plite-test
# Directory on the board holding the uploaded Paddle-Lite include/ and lib/ dirs.
PADDLE_LITE_DIR = /opt/Paddlelite
CFLAGS = -O3 -Wall -std=c++11 `pkg-config --cflags opencv`
# armv7hf target: hard-float ABI with NEON, matching the cross-built library.
CFLAGS += -march=armv7-a -mfloat-abi=hard -mfpu=neon -I$(PADDLE_LITE_DIR)/include
LDFLAGS = -O3 -Wall -std=c++11 `pkg-config --libs opencv`
LDFLAGS += -L$(PADDLE_LITE_DIR)/lib
CC = g++

all: $(TARGET)

# NOTE: make requires each recipe line below to begin with a TAB character.
$(TARGET): $(TARGET).o
	$(CC) $(LDFLAGS) -o $@ $^ `pkg-config --libs opencv` -lpthread -lpaddle_full_api_shared

%.o: %.cpp
	$(CC) $(CFLAGS) -c -o $@ $<

clean:
	rm -f $(TARGET) *.a *.o *~
plite-test.cpp文件:
#include <arm_neon.h>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/core/core.hpp>
#include <stdio.h>
#include <sys/time.h>
#include <pthread.h>
#include <unistd.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <limits>
#include "paddle_api.h"
using namespace paddle::lite_api;
using namespace cv;
using namespace std;
// Number of CPU threads the Paddle-Lite predictor may use.
const int CPU_THREAD_NUM = 1;
// CPU power mode: prefer big cores for maximum throughput.
const paddle::lite_api::PowerMode CPU_POWER_MODE =
paddle::lite_api::PowerMode::LITE_POWER_HIGH;
// One detected object: bounding box, class index into class_names, and score.
struct Object {
cv::Rect rec;
int class_id;
float prob;
};
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
// Return the current wall-clock time in microseconds since the Unix epoch.
inline int64_t get_current_us() {
  struct timeval now;
  gettimeofday(&now, NULL);
  int64_t usec = static_cast<int64_t>(now.tv_sec) * 1000000LL;
  usec += static_cast<int64_t>(now.tv_usec);
  return usec;
}
// Detection class labels; order must match the label list of the dataset
// used when training/exporting the PaddleX model.
const char* class_names[] = { "ca_hua", "ju_pi", "lou_di", "pen_liu", "qi_keng", "qi_pao", "za_se", "zang_dian"
};
// Convert an interleaved RGB float image (din, HWC order) into planar CHW
// output (dout) while normalizing each channel: out = (in - mean) / scale.
// `size` is the pixel count (width * height); din holds size*3 floats.
// The main loop handles 4 pixels per iteration with NEON; the scalar tail
// processes the remaining (size % 4) pixels.
void neon_mean_scale(const float* din,
                     float* dout,
                     int size,
                     const std::vector<float> mean,
                     const std::vector<float> scale) {
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    exit(1);
  }
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  // multiply by the reciprocal instead of dividing per element
  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
  // one output plane per channel (CHW layout)
  float* dout_c0 = dout;
  float* dout_c1 = dout + size;
  float* dout_c2 = dout + size * 2;
  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);  // de-interleave 4 RGB pixels
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);
    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  // Scalar tail: must mirror the NEON path exactly — divide by scale and
  // write each channel to its own plane (the original wrote all three
  // channels through dout_c0 and multiplied by scale instead of dividing).
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
  }
}
// Pre-processing: convert BGR to RGB, resize to the network input size,
// scale pixels to [0,1], then write mean/std-normalized planar (CHW) data
// into `data` via neon_mean_scale.
void pre_process(const cv::Mat& img, int width, int height, float* data) {
  // ImageNet mean / std used by the exported model.
  std::vector<float> mean = { 0.485f, 0.456f, 0.406f };
  std::vector<float> scale = { 0.229f, 0.224f, 0.225f };
  cv::Mat rgb_img;
  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f,
             cv::INTER_CUBIC);
  cv::Mat float_img;
  rgb_img.convertTo(float_img, CV_32FC3, 1 / 255.f);
  const float* src = reinterpret_cast<const float*>(float_img.data);
  neon_mean_scale(src, data, width * height, mean, scale);
}
// Post-processing: decode the raw detection output into Object boxes and
// draw them onto `image`.
// `data` layout: `count` rows of 6 floats [class_id, score, x0, y0, x1, y1].
// Rows with score <= thresh are skipped; kept boxes are clipped to the image
// and annotated with "<label>: <score>".
std::vector<Object> detect_object(const float* data,
                                  int count,
                                  float thresh,
                                  cv::Mat& image) {
  if (data == nullptr) {
    std::cerr << "[ERROR] data can not be nullptr\n";
    exit(1);
  }
  const int num_classes =
      static_cast<int>(sizeof(class_names) / sizeof(class_names[0]));
  std::vector<Object> rect_out;
  for (int iw = 0; iw < count; iw++) {
    if (data[1] > thresh) {
      Object obj;
      int x = static_cast<int>(data[2]);
      int y = static_cast<int>(data[3]);
      int w = static_cast<int>(data[4] - data[2] + 1);
      int h = static_cast<int>(data[5] - data[3] + 1);
      // clip the box to the image bounds
      cv::Rect rec_clip =
          cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows);
      obj.class_id = static_cast<int>(data[0]);
      obj.prob = data[1];
      obj.rec = rec_clip;
      // also guard against a class id outside the label table before
      // indexing class_names
      if (w > 0 && h > 0 && obj.prob <= 1 && obj.class_id >= 0 &&
          obj.class_id < num_classes) {
        rect_out.push_back(obj);
        cv::rectangle(image, rec_clip, cv::Scalar(0, 255, 0), 2, cv::LINE_AA);
        std::string str_prob = std::to_string(obj.prob);
        // label text: "<class>: <score to 3 decimals>"
        std::string text = std::string(class_names[obj.class_id]) + ": " +
                           str_prob.substr(0, str_prob.find(".") + 4);
        int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL;
        double font_scale = 1.f;
        int thickness = 2;
        cv::Size text_size =
            cv::getTextSize(text, font_face, font_scale, thickness, nullptr);
        // shrink the font so the label spans roughly half the box width
        float new_font_scale = w * 0.5 * font_scale / text_size.width;
        text_size = cv::getTextSize(
            text, font_face, new_font_scale, thickness, nullptr);
        cv::Point origin;
        origin.x = x + 3;
        origin.y = y + text_size.height + 3;
        // Use the fitted font/origin computed above instead of a hard-coded
        // FONT_HERSHEY_PLAIN at scale 3 (the computed values were previously
        // dead code), so the label stays inside the box.
        cv::putText(image,
                    text,
                    origin,
                    font_face,
                    new_font_scale,
                    cv::Scalar(0, 0, 255),
                    thickness,
                    cv::LINE_AA);
        std::cout << "detection, image size: " << image.cols << ", "
                  << image.rows
                  << ", detect object: " << class_names[obj.class_id]
                  << ", score: " << obj.prob << ", location: x=" << x
                  << ", y=" << y << ", width=" << w << ", height=" << h
                  << std::endl;
      }
    }
    data += 6;  // advance to the next detection row
  }
  return rect_out;
}
// Run one inference pass: fill the image tensor and the image-shape tensor,
// execute the predictor, decode detections, and return an annotated copy of
// the input image (the input itself is not modified).
cv::Mat RunModel(const cv::Mat& img, std::shared_ptr<paddle::lite_api::PaddlePredictor> &predictor) {
  const int in_width = 320;   // network input width
  const int in_height = 320;  // network input height
  // input 0: normalized image data, NCHW
  // (GetInput returns a unique_ptr by value — no std::move needed)
  std::unique_ptr<Tensor> input_tensor0 = predictor->GetInput(0);
  input_tensor0->Resize({ 1, 3, in_height, in_width });
  auto* data0 = input_tensor0->mutable_data<float>();
  pre_process(img, in_width, in_height, data0);
  // input 1: original image size (rows, cols), needed by the YOLOv3 head
  std::unique_ptr<Tensor> input_tensor1 = predictor->GetInput(1);
  input_tensor1->Resize({ 1, 2 });
  auto* data1 = input_tensor1->mutable_data<int>();
  data1[0] = img.rows;
  data1[1] = img.cols;
  predictor->Run();
  std::unique_ptr<const Tensor> output_tensor = predictor->GetOutput(0);
  auto* outptr = output_tensor->data<float>();
  // reuse the file's ShapeProduction helper instead of an inline loop;
  // each detection occupies 6 floats: [class_id, score, x0, y0, x1, y1]
  int64_t cnt = ShapeProduction(output_tensor->shape());
  cv::Mat output_image = img.clone();
  detect_object(outptr, static_cast<int>(cnt / 6), 0.5f, output_image);
  return output_image;
}
int main() {
// 使用FPGA进行推理
std::vector<Place> valid_places({
Place{TARGET(kIntelFPGA), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},185 });
CxxConfig config;
// 加载PaddleX导出模型
config.set_model_file("/opt/inference_model/__model__");
config.set_param_file("/opt/inference_model/__params__");
config.set_valid_places(valid_places);
config.set_threads(CPU_THREAD_NUM);
config.set_power_mode(CPU_POWER_MODE);
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
// 预测推理图片以及推理后保存图片路径
std::string input_image_path = "/opt/images/11.jpg";
std::string output_image_path = "/opt/11_result.jpg";
cv::Mat input_image = cv::imread(input_image_path);
cv::Mat output_image = RunModel(input_image, predictor);
cv::imwrite(output_image_path, output_image);
return 0;
}
3)在开发板上进行编译:
cd /opt/plite-yolov3
make
4)编译成功后:
ll /opt/plite-yolov3
Makefile
plite-test
plite-test.cpp
plite-test.o
5)进行推理;
输入./plite-test指令:
查看推理图片/opt/11_result.jpg,可以看到推理结果与前面PaddleX的推理结果有所区别。
作者:杨振宇、田辉
杨振宇,海云捷迅资深系统架构师,成都信息工程大学计算机应用技术硕士研究生毕业,10余年软件开发和架构经验,熟悉Linux,OpenStack,Kubernetes,Docker等开源技术并具有开源社区贡献经历,在云计算、人工智能、物联网等技术领域有较深的研究和丰富的一线开发经验。
田辉,海云捷迅研发工程师。毕业于湖北大学计信学院,计算机科学与技术专业。4年软件开发经验,熟悉Linux、OpenStack、Kubernetes、Docker等开源技术,在云计算、人工智能等技术领域有一定的开发经验。