gpt4 book ai didi

tensorflow - Nvidia TX1上的TensorFlow

转载 作者:行者123 更新时间:2023-12-04 00:57:45 26 4
gpt4 key购买 nike

有人在Nvidia Tegra X1上使用了tensorflow吗?

我发现一些资料表明这在TK1上或许可行,或者在TX1上经过大量修改(hack)/踩坑后也许能运行,但尚无确定可行的配方。

  • http://cudamusing.blogspot.de/2015/11/building-tensorflow-for-jetson-tk1.html
  • https://github.com/tensorflow/tensorflow/issues/851

  • 我正在使用JetPack 2.3安装程序,但尚未成功——非常感谢任何提示。

    最佳答案

    TensorFlow R0.9在带有Bazel 0.2.1、CUDA 8.0、CUDNN 5.1、L4T 24.2和全新JetPack 2.3的TX1上可以运行。我已经用基本的MLP、Conv和LSTM网络(包含BN、Sigmoid、ReLU等)对其进行了测试,没有发现错误。我删除了sparse_matmul_op,但除此之外编译应该完全正常。其中许多步骤直接来自MaxCuda的出色博客,非常感谢他们的分享。

    我计划继续攻关R0.10/R0.11(gRPC二进制文件目前阻碍了Bazel 0.3.0的构建),但在那之前,我想先发布确定可用的R0.9配方。如下:

    先得到java

    sudo add-apt-repository ppa:webupd8team/java
    sudo apt-get update
    sudo apt-get install oracle-java8-installer

    安装其他一些依赖
    sudo apt-get install git zip unzip autoconf automake libtool curl zlib1g-dev maven swig

    需要自己构建protobuf 3.0.0-beta-2 jar
    git clone https://github.com/google/protobuf.git
    cd protobuf
    # autogen.sh downloads broken gmock.zip in d5fb408d
    git checkout master
    ./autogen.sh
    git checkout d5fb408d
    ./configure --prefix=/usr
    make -j 4
    sudo make install
    cd java
    mvn package

    现在构建Bazel。我们需要0.2.1版本:它与0.3.0不同,不需要gRPC二进制文件——后者我还没能构建出来(可能很快就可以!)
    git clone https://github.com/bazelbuild/bazel.git
    cd bazel
    git checkout 0.2.1
    cp /usr/bin/protoc third_party/protobuf/protoc-linux-arm32.exe
    cp ../protobuf/java/target/protobuf-java-3.0.0-beta-2.jar third_party/protobuf/protobuf-java-3.0.0-beta-1.jar

    需要编辑bazel文件以将aarch64识别为ARM
    --- a/src/main/java/com/google/devtools/build/lib/util/CPU.java
    +++ b/src/main/java/com/google/devtools/build/lib/util/CPU.java
    @@ -25,7 +25,7 @@ import java.util.Set;
    public enum CPU {
    X86_32("x86_32", ImmutableSet.of("i386", "i486", "i586", "i686", "i786", "x86")),
    X86_64("x86_64", ImmutableSet.of("amd64", "x86_64", "x64")),
    - ARM("arm", ImmutableSet.of("arm", "armv7l")),
    + ARM("arm", ImmutableSet.of("arm", "armv7l", "aarch64")),
    UNKNOWN("unknown", ImmutableSet.<String>of());

    现在编译
    ./compile.sh

    并安装
    sudo cp output/bazel /usr/local/bin

    获取tensorflow R0.9。高于R0.9需要Bazel 0.3.0,由于gRPC问题,我还没有弄清楚该如何构建。
    git clone -b r0.9 https://github.com/tensorflow/tensorflow.git

    先构建一次。这次构建会失败,但失败之后您就有了bazel的.cache目录,可以在其中放入更新版的config.guess和config.sub文件,它们用于识别您所运行的体系结构
    ./configure
    bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_package

    cd ~
    wget -O config.guess 'http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD'
    wget -O config.sub 'http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD'

    # below are commands I ran, yours will vary depending on .cache details. `find` is your friend
    cp config.guess ./.cache/bazel/_bazel_socialh/742c01ff0765b098544431b60b1eed9f/external/farmhash_archive/farmhash-34c13ddfab0e35422f4c3979f360635a8c050260/config.guess
    cp config.sub ./.cache/bazel/_bazel_socialh/742c01ff0765b098544431b60b1eed9f/external/farmhash_archive/farmhash-34c13ddfab0e35422f4c3979f360635a8c050260/config.sub

    sparse_matmul_op有几个错误,我选择了省事的做法,直接将其从构建中移除
    --- a/tensorflow/core/kernels/BUILD
    +++ b/tensorflow/core/kernels/BUILD
    @@ -985,7 +985,7 @@ tf_kernel_libraries(
    "reduction_ops",
    "segment_reduction_ops",
    "sequence_ops",
    - "sparse_matmul_op",
    + #DC "sparse_matmul_op",
    ],
    deps = [
    ":bounds_check",

    --- a/tensorflow/python/BUILD
    +++ b/tensorflow/python/BUILD
    @@ -1110,7 +1110,7 @@ medium_kernel_test_list = glob([
    "kernel_tests/seq2seq_test.py",
    "kernel_tests/slice_op_test.py",
    "kernel_tests/sparse_ops_test.py",
    - "kernel_tests/sparse_matmul_op_test.py",
    + #DC "kernel_tests/sparse_matmul_op_test.py",
    "kernel_tests/sparse_tensor_dense_matmul_op_test.py",
    ])

    TX1的编译器无法处理cwise_op_gpu_select.cu.cc中的花式大括号初始化写法,需改为逐元素赋值
    --- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
    +++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
    @@ -43,8 +43,14 @@ struct BatchSelectFunctor<GPUDevice, T> {
    const int all_but_batch = then_flat_outer_dims.dimension(1);

    #if !defined(EIGEN_HAS_INDEX_LIST)
    - Eigen::array<int, 2> broadcast_dims{{ 1, all_but_batch }};
    - Eigen::Tensor<int, 2>::Dimensions reshape_dims{{ batch, 1 }};
    + //DC Eigen::array<int, 2> broadcast_dims{{ 1, all_but_batch }};
    + Eigen::array<int, 2> broadcast_dims;
    + broadcast_dims[0] = 1;
    + broadcast_dims[1] = all_but_batch;
    + //DC Eigen::Tensor<int, 2>::Dimensions reshape_dims{{ batch, 1 }};
    + Eigen::Tensor<int, 2>::Dimensions reshape_dims;
    + reshape_dims[0] = batch;
    + reshape_dims[1] = 1;
    #else
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast_dims;
    broadcast_dims.set(1, all_but_batch);

    sparse_tensor_dense_matmul_op_gpu.cu.cc中的内容相同
    --- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
    +++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
    @@ -104,9 +104,17 @@ struct SparseTensorDenseMatMulFunctor<GPUDevice, T, ADJ_A, ADJ_B> {
    int n = (ADJ_B) ? b.dimension(0) : b.dimension(1);

    #if !defined(EIGEN_HAS_INDEX_LIST)
    - Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz{{ 1, nnz }};
    - Eigen::array<int, 2> n_by_1{{ n, 1 }};
    - Eigen::array<int, 1> reduce_on_rows{{ 0 }};
    + //DC Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz{{ 1, nnz }};
    + Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz;
    + matrix_1_by_nnz[0] = 1;
    + matrix_1_by_nnz[1] = nnz;
    + //DC Eigen::array<int, 2> n_by_1{{ n, 1 }};
    + Eigen::array<int, 2> n_by_1;
    + n_by_1[0] = n;
    + n_by_1[1] = 1;
    + //DC Eigen::array<int, 1> reduce_on_rows{{ 0 }};
    + Eigen::array<int, 1> reduce_on_rows;
    + reduce_on_rows[0] = 0;
    #else
    Eigen::IndexList<Eigen::type2index<1>, int> matrix_1_by_nnz;
    matrix_1_by_nnz.set(1, nnz);

    与CUDA 8.0一起运行需要FP16的新宏。非常感谢Kashif/Mrry指出了解决方法!
    --- a/tensorflow/stream_executor/cuda/cuda_blas.cc
    +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
    @@ -25,6 +25,12 @@ limitations under the License.
    #define EIGEN_HAS_CUDA_FP16
    #endif

    +#if CUDA_VERSION >= 8000
    +#define SE_CUDA_DATA_HALF CUDA_R_16F
    +#else
    +#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF
    +#endif
    +
    #include "tensorflow/stream_executor/cuda/cuda_blas.h"

    #include <dlfcn.h>
    @@ -1680,10 +1686,10 @@ bool CUDABlas::DoBlasGemm(
    return DoBlasInternal(
    dynload::cublasSgemmEx, stream, true /* = pointer_mode_host */,
    CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
    - CUDAMemory(a), CUBLAS_DATA_HALF, lda,
    - CUDAMemory(b), CUBLAS_DATA_HALF, ldb,
    + CUDAMemory(a), SE_CUDA_DATA_HALF, lda,
    + CUDAMemory(b), SE_CUDA_DATA_HALF, ldb,
    &beta,
    - CUDAMemoryMutable(c), CUBLAS_DATA_HALF, ldc);
    + CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
    #else
    LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
    << "(need at least CUDA 7.5)";

    最后,ARM没有NUMA节点,因此需要添加它,否则在启动tf.Session()时会立即崩溃。
    --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
    +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
    @@ -888,6 +888,9 @@ CudaContext* CUDAExecutor::cuda_context() { return context_; }
    // For anything more complicated/prod-focused than this, you'll likely want to
    // turn to gsys' topology modeling.
    static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
    + // DC - make this clever later. ARM has no NUMA node, just return 0
    + LOG(INFO) << "ARM has no NUMA node, hardcoding to return zero";
    + return 0;
    #if defined(__APPLE__)
    LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
    return 0;

    完成这些更改后,构建并安装!希望这对某些人有用。

    关于tensorflow - Nvidia TX1上的TensorFlow,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/39783919/

    26 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com