The implementation so far was prematurely optimized. It had all threads record directly into a shared vector indexed by block_ids. The idea was (1) to avoid the overhead of locking or other synchronization primitives when tracing a multi-thread execution, and (2) to avoid the overhead of growing heap buffers. The new implementation is much more straightforward, as is most evident from the fact that it doesn't use relaxed_atomic_store anymore (yet still runs free of TSan errors), and that we were able to remove the ProcessedTrace class. The above-mentioned issues (1) and (2) that drove the earlier design are now addressed as follows in the new design: (1) Each thread now records to its own specific vector of trace entries; these thread-specific vectors are only coalesced into a global vector when dumping a trace. This removes the need for any locking or atomic operations. (2) We are less careful than before about avoiding heap allocations. We just reserve upfront a rather large buffer size, large enough to avoid most subsequent heap reallocations and small enough to still not matter in practical tracing situations. The proximate motivation for this change is that the existing design, requiring indexing of trace entries by block_id, is now inconvenient as we need to experiment with TrMul implementation changes where packing is not necessarily directly associated with block_ids anymore. PiperOrigin-RevId: 259996147
77 lines
2.7 KiB
C++
77 lines
2.7 KiB
C++
/* Copyright 2019 Google LLC. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRACE_H_
|
|
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRACE_H_
|
|
|
|
#include <algorithm>
|
|
#include <cstdint>
|
|
#include <cstdio>
|
|
#include <vector>
|
|
|
|
#include "tensorflow/lite/experimental/ruy/block_map.h"
|
|
|
|
namespace ruy {
|
|
|
|
struct Trace;
|
|
|
|
#ifdef RUY_TRACE
|
|
|
|
// Per-invocation tracing state. Holds lazily-initialized configuration and,
// when tracing is active, the Trace object currently being recorded.
struct TracingContext {
  // Whether this context's configuration fields below have been set up yet.
  bool initialized = false;
  // Whether tracing is enabled for this context.
  bool enabled = false;
  // Shape filter fields: presumably, when nonzero, tracing is restricted to
  // matmuls matching this (rows, cols, depth) shape -- TODO confirm against
  // the out-of-line implementation.
  int filter_shape_rows = 0;
  int filter_shape_cols = 0;
  int filter_shape_depth = 0;
  // The Trace being recorded; null when no trace is active. NOTE(review):
  // presumably owned by this context and released by the destructor --
  // verify in the implementation file.
  Trace* trace = nullptr;
  // Destructor is declared here but defined out of line: Trace is only
  // forward-declared in this header, so any cleanup of `trace` must happen
  // where Trace is a complete type.
  ~TracingContext();
};
|
|
|
|
// Tracing entry points, implemented out of line (exact semantics live in the
// corresponding .cc file). The comments below describe intent as suggested by
// the names; confirm details against the implementation.

// Returns a new Trace for a (rows, depth, cols) matmul when tracing is
// enabled in `context`, otherwise returns null.
Trace* NewTraceOrNull(TracingContext* context, int rows, int depth, int cols);
// Records that thread `thread_id` has started participating in `trace`.
void TraceRecordThreadStart(std::uint32_t thread_id, Trace* trace);
// Records that thread `thread_id` has entered its main work loop.
void TraceRecordThreadLoopStart(std::uint32_t thread_id, Trace* trace);
// Records that thread `thread_id` reserved block `block_id` for processing.
void TraceRecordBlockReserved(std::uint32_t thread_id, std::uint32_t block_id,
                              Trace* trace);
// Records that thread `thread_id` packed block index `block` on the given
// side (LHS/RHS, see block_map.h for the Side type).
void TraceRecordBlockPacked(std::uint32_t thread_id, Side side, int block,
                            Trace* trace);
// Records that thread `thread_id` finished processing block `block_id`.
void TraceRecordBlockFinished(std::uint32_t thread_id, std::uint32_t block_id,
                              Trace* trace);
// Records that thread `thread_id` is done with its work on `trace`.
void TraceRecordThreadEnd(std::uint32_t thread_id, Trace* trace);
// Records the start of the traced operation.
void TraceRecordStart(Trace* trace);
// Records the start of the execution phase, with the block map and the
// number of participating threads.
void TraceRecordExecute(const BlockMap& block_map, int thread_count,
                        Trace* trace);
// Records the end of the traced operation.
void TraceRecordEnd(Trace* trace);
|
|
|
|
#else
|
|
|
|
// Empty placeholder when RUY_TRACE is not defined: carries no state, so the
// tracing machinery compiles away entirely.
struct TracingContext {};
|
|
|
|
inline Trace* NewTraceOrNull(TracingContext*, int, int, int) { return nullptr; }
|
|
inline void TraceRecordThreadStart(std::uint32_t, Trace*) {}
|
|
inline void TraceRecordThreadLoopStart(std::uint32_t, Trace*) {}
|
|
inline void TraceRecordBlockReserved(std::uint32_t, std::uint32_t, Trace*) {}
|
|
inline void TraceRecordBlockPacked(std::uint32_t, Side, int, Trace*) {}
|
|
inline void TraceRecordBlockFinished(std::uint32_t, std::uint32_t, Trace*) {}
|
|
inline void TraceRecordThreadEnd(std::uint32_t, Trace*) {}
|
|
inline void TraceRecordStart(Trace*) {}
|
|
inline void TraceRecordExecute(const BlockMap&, int, Trace*) {}
|
|
inline void TraceRecordEnd(Trace*) {}
|
|
|
|
#endif
|
|
|
|
} // namespace ruy
|
|
|
|
#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRACE_H_
|