diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 48b59750a1c..8d9ddb97d9e 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -742,6 +742,12 @@ GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( offset = std::max(offset, RoundUpToNearest(chunk.chunk_end(), alignment_)); } use_free_chunk_if_smaller(offset, result_.heap_size - offset); + // When preferred offset is provided and the preferred offset is larger than + // the current heap size, simply use the preferred offset provided. + if (result_.heap_size <= preferred_offset) { + chunk_candidate.heap_size = preferred_offset + buffer_interval.size; + min_fit_chunk = {preferred_offset, buffer_interval.size}; + } if (min_fit_chunk.offset == -1) { // Increase the heap size to fit in the last free chunk. diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 06d0a0c67e1..7dd6686bcea 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -19,8 +19,8 @@ namespace xla { namespace { // Define a dummy chunk for chunks that will be allocated in the default memory -// space. -const HeapSimulator::Chunk kDefaultMemorySpaceDummyChunk{-1, -1}; +// space and for keeping track of number of asynchronous copies. +const HeapSimulator::Chunk kDummyChunk{-1, -1}; } // namespace std::vector @@ -91,12 +91,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { MemorySpaceAssignment::AllocationSequence* allocation_sequence = &(*allocation_map_)[&buffer]; - if (keep_in_default_memory) { - continue; - } // At this point, none of the colocated buffers contain any phi buffers. 
for (const BufferInterval* colocated_interval : colocated_intervals) { + if (keep_in_default_memory) { + break; + } const HloValue* value = colocated_interval->buffer; int64 definition_time = instruction_schedule_->at(value->defining_instruction()); @@ -114,15 +114,27 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // Skip allocating buffers for bitcast uses. The uses that feed from // bitcasts will be handled specially. if (use.instruction->opcode() != HloOpcode::kBitcast) { - FindAllocation(definition_time, use_time, value->defining_position(), - use, value, colocated_interval->size, - allocation_sequence); + if (!FindAllocation(definition_time, use_time, + value->defining_position(), use, value, + colocated_interval->size, allocation_sequence)) { + // If the allocation finding failed (e.g., due to running out of + // asynchronous copies), then fall back to allocating the buffer + // entirely in the default memory. + pending_chunks_.clear(); + pending_async_copies_.clear(); + allocation_sequence->clear(); + keep_in_default_memory = true; + break; + } + // If there are multiple uses, they can try using the memory // allocation already at the alternate memory. definition_time = use_time; } } } + + CommitPendingChunks(); } if (VLOG_IS_ON(3)) { @@ -147,7 +159,32 @@ HloInstruction* AlternateMemoryBestFitHeap::GetInstructionAt(int64 time) const { return flattened_instruction_sequence_->instructions()[time]; } -void AlternateMemoryBestFitHeap::FindAllocation( +void AlternateMemoryBestFitHeap::CommitPendingChunks() { + for (auto interval_and_chunk : pending_chunks_) { + VLOG(3) << "Committing chunk: " << interval_and_chunk.first.start << "-" + << interval_and_chunk.first.end << " : [" + << interval_and_chunk.second.chunk.offset << ", " + << interval_and_chunk.second.chunk.size << "]"; + CommitChunk(interval_and_chunk.first, interval_and_chunk.second); + } + pending_chunks_.clear(); + // Also add the pending async copies to the interval tree. 
+ if (max_outstanding_async_copies_ >= 0) { + for (auto interval : pending_async_copies_) { + async_copy_interval_tree_.Add(interval.first, interval.second, + kDummyChunk); + } + } + pending_async_copies_.clear(); +} + +void AlternateMemoryBestFitHeap::AddToPendingChunks( + const BufferInterval& buffer_interval, + const ChunkCandidate& chunk_candidate) { + pending_chunks_.emplace_back(buffer_interval, chunk_candidate); +} + +bool AlternateMemoryBestFitHeap::FindAllocation( int64 start_time, int64 end_time, HloPosition defining_position, HloUse use, const HloValue* buffer, int64 size, MemorySpaceAssignment::AllocationSequence* allocations) { @@ -181,7 +218,7 @@ void AlternateMemoryBestFitHeap::FindAllocation( if (TryAllocatingInAlternateMemoryNoCopy( start_time, end_time, defining_position, use, alternate_mem_interval, non_bitcast_operand, allocations)) { - return; + return true; } MemorySpaceAssignment::Allocation* prev_allocation = nullptr; @@ -199,26 +236,46 @@ void AlternateMemoryBestFitHeap::FindAllocation( // TODO(berkin): For now evictions happen relative to the most recent // allocation in the alternate memory. We can potentially start evictions // earlier and end later. - HloInstruction* earliest_instruction = - GetInstructionAt(prev_allocation->start_time()); - HloInstruction* latest_instruction = - GetInstructionAt(prev_allocation->end_time()); - VLOG(3) << "Evicting buffer at " << prev_allocation->chunk().offset << " (" << prev_allocation->start_time() << ", " << prev_allocation->end_time() << ")"; - VLOG(3) << "Copy to default mem between instructions " - << earliest_instruction->ToString() << " - " - << latest_instruction->ToString(); - // The live range of this buffer is from the start time of the previous - // buffer that was in the alternate memory so that a buffer is allocated - // during the copy. 
- allocations->push_back( - absl::make_unique( - *prev_allocation, MemorySpace::kDefault, - kDefaultMemorySpaceDummyChunk, prev_allocation->start_time(), - end_time, earliest_instruction, latest_instruction)); + // See if this interval would violate the asynchronous copy limit. + if (!ViolatesMaximumOutstandingAsyncCopies(prev_allocation->start_time(), + prev_allocation->end_time())) { + AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, kDummyChunk, + prev_allocation->start_time(), prev_allocation->end_time(), + allocations); + + } else { + VLOG(3) << "This violates the maximum async copies."; + // If the original interval violated the limit, try sub-intervals within + // this interval. + bool eviction_scheduled = false; + for (int64 time = prev_allocation->start_time(); + time <= prev_allocation->end_time(); ++time) { + VLOG(3) << "Try evicting (" << time << ", " << time << ")"; + if (!ViolatesMaximumOutstandingAsyncCopies(time, time)) { + VLOG(3) << "Eviction successful."; + AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, kDummyChunk, + time, time, allocations); + eviction_scheduled = true; + break; + } + } + + if (!eviction_scheduled) { + // If the eviction couldn't be scheduled, then fail. This buffer will be + // kept in the default memory. 
+ VLOG(3) << "Bailing: Could not evict " << use.ToString() + << " because we hit the limit of maximum asynchronous copies " + << "between " + << GetInstructionAt(prev_allocation->start_time())->ToString() + << " and " + << GetInstructionAt(prev_allocation->end_time())->ToString(); + return false; + } + } } else if (prev_allocation != nullptr && prev_allocation->memory_space() == MemorySpace::kDefault && prev_allocation->instruction() == non_bitcast_operand) { @@ -229,7 +286,7 @@ void AlternateMemoryBestFitHeap::FindAllocation( } else { allocations->push_back(absl::make_unique( non_bitcast_operand, defining_position, MemorySpace::kDefault, - kDefaultMemorySpaceDummyChunk, start_time, end_time)); + kDummyChunk, start_time, end_time)); } // Try partially placing the buffer in the alternate space. The time that is @@ -252,35 +309,81 @@ void AlternateMemoryBestFitHeap::FindAllocation( VLOG(4) << "Trying alternate memory allocation (" << alternate_mem_interval.start << ", " << alternate_mem_interval.end << ")"; + // If this additional asynchronous copy would violate the limit, try a + // different interval. + if (ViolatesMaximumOutstandingAsyncCopies(alternate_mem_interval.start, + alternate_mem_interval.end)) { + VLOG(4) << "This would violate the outstanding async copy limit."; + continue; + } ChunkCandidate chunk_candidate = FindChunkCandidate(alternate_mem_interval); // Check if the new heap size fits within limits. if (chunk_candidate.heap_size < max_size_in_bytes_) { - HloInstruction* earliest_instruction = - GetInstructionAt(alternate_mem_interval.start); VLOG(3) << "Move the buffer to alternate memory at " << alternate_mem_interval.start << ". 
Offset = " << chunk_candidate.chunk.offset << ", size = " << chunk_candidate.chunk.size << ", heap_size = " << chunk_candidate.heap_size; - VLOG(3) << "Copy to alternate mem between instructions " - << earliest_instruction->ToString() << " - " - << use.instruction->ToString(); - CommitChunk(alternate_mem_interval, chunk_candidate); + AddToPendingChunks(alternate_mem_interval, chunk_candidate); + + AddAsyncCopy(*allocations->back().get(), MemorySpace::kAlternate, + chunk_candidate.chunk, alternate_mem_interval.start, + end_time, allocations); - // Since copies couldn't be removed, create an allocation in the - // default memory space. - allocations->push_back( - absl::make_unique( - *allocations->back().get(), MemorySpace::kAlternate, - chunk_candidate.chunk, alternate_mem_interval.start, end_time, - earliest_instruction, use.instruction)); allocations->back()->AddUse(use); - return; + return true; } } // If a copy wasn't inserted, then add this use to the latest allocation. allocations->back()->AddUse(use); + return true; +} + +void AlternateMemoryBestFitHeap::AddAsyncCopy( + const MemorySpaceAssignment::Allocation& prev_allocation, + MemorySpace memory_space, Chunk chunk, int64 start_time, int64 end_time, + MemorySpaceAssignment::AllocationSequence* allocations) { + HloInstruction* earliest_instruction = GetInstructionAt(start_time); + HloInstruction* latest_instruction = GetInstructionAt(end_time); + + VLOG(3) << "Copy to " + << (memory_space == MemorySpaceAssignment::MemorySpace::kDefault + ? "default" + : "alternate") + << " memory between instructions " << earliest_instruction->ToString() + << " - " << latest_instruction->ToString(); + + allocations->push_back( + absl::make_unique( + prev_allocation, memory_space, chunk, start_time, end_time, + earliest_instruction, latest_instruction)); + + // Register the additional async copy with the interval tree to keep track of + // the limit at any given time. 
+ pending_async_copies_.emplace_back(start_time, end_time); +} + +bool AlternateMemoryBestFitHeap::ViolatesMaximumOutstandingAsyncCopies( + int64 start_time, int64 end_time) const { + if (max_outstanding_async_copies_ < 0) { + return false; + } + + // Count both the asynchronous copies in the interval tree as well as the + // pending asynchronous copies belonging to this buffer. + int64 num_async_copies = + async_copy_interval_tree_.ChunksOverlappingInTime(start_time, end_time) + .size(); + + for (auto interval : pending_async_copies_) { + if (interval.second > start_time && interval.first < end_time) { + num_async_copies++; + } + } + // Add one because we are checking if adding an additional asynchronous copy + // would violate the limit. + return num_async_copies + 1 > max_outstanding_async_copies_; } bool AlternateMemoryBestFitHeap::TryAllocatingInAlternateMemoryNoCopy( @@ -332,7 +435,7 @@ bool AlternateMemoryBestFitHeap::TryAllocatingInAlternateMemoryNoCopy( << chunk_candidate.chunk.offset << ", size = " << chunk_candidate.chunk.size << ", heap_size = " << chunk_candidate.heap_size; - CommitChunk(alternate_mem_interval, chunk_candidate); + AddToPendingChunks(alternate_mem_interval, chunk_candidate); // If there was a previous allocation, the buffer location is the // same as the previous. Otherwise, it is the operand. 
@@ -351,6 +454,22 @@ bool AlternateMemoryBestFitHeap::TryAllocatingInAlternateMemoryNoCopy( return false; } +/*static*/ int64 MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies( + const HloModule& module) { + int64 max_copies = 0; + int64 current_copies = 0; + for (HloInstruction* instruction : + module.schedule().sequence(module.entry_computation()).instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + } + max_copies = std::max(max_copies, current_copies); + } + return max_copies; +} + /*static*/ StatusOr> MemorySpaceAssignment::Run( HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes, @@ -358,7 +477,8 @@ MemorySpaceAssignment::Run( int64 alternate_memory_space_alignment_in_bytes, BufferValue::SizeFunction size_fn, AlternateMemoryBestFitHeap::IsAllowedInAlternateMemoryFunction - is_allowed_in_alternate_mem) { + is_allowed_in_alternate_mem, + int64 max_outstanding_async_copies) { CHECK(module->has_schedule()); VLOG(4) << "Module before memory space assignment: "; XLA_VLOG_LINES(4, module->ToString()); @@ -372,7 +492,7 @@ MemorySpaceAssignment::Run( min_prefetch_interval, max_prefetch_interval, *alias_analysis, alternate_memory_space_alignment_in_bytes, GlobalDecreasingSizeBestFitHeap::Type::kSpatial, - is_allowed_in_alternate_mem); + is_allowed_in_alternate_mem, max_outstanding_async_copies); TF_RETURN_IF_ERROR(HeapSimulator::Run(std::move(algorithm), *module, module->schedule(), @@ -385,6 +505,8 @@ MemorySpaceAssignment::Run( VLOG(4) << "Module after memory space assignment: "; XLA_VLOG_LINES(4, module->ToString()); TF_CHECK_OK(module->schedule().Verify()); + VLOG(1) << "Maximum number of outstanding async copies: " + << CountMaximumOutstandingAsyncCopies(*module); return std::move(memory_space_assignment.preset_assignments_); } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h 
b/tensorflow/compiler/xla/service/memory_space_assignment.h index 0816eeec481..71ed39ded04 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -200,6 +200,8 @@ class MemorySpaceAssignment { // in the alternate memory space, size_fn is the size function for buffer // values, and is_allowed_in_alternate_mem can be used to prevent certain // HloValues (e.g., based on the opcode) to be placed on the alternate memory. + // max_outstanding_async_copies specifies the upper bound for number of + // outstanding asynchronous copies, -1 for unlimited. // TODO(berkin): Use the cost model instead of using number of instructions to // decide how early to prefetch. static StatusOr> Run( @@ -207,7 +209,12 @@ class MemorySpaceAssignment { int64 min_prefetch_interval, int64 max_prefetch_interval, int64 alternate_memory_space_alignment_in_bytes, BufferValue::SizeFunction size_fn, - std::function is_allowed_in_alternate_mem); + std::function is_allowed_in_alternate_mem, + int64 max_outstanding_async_copies = -1); + + // Returns the maximum number of outstanding asynchronous copies in the + // module. 
+ static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module); private: MemorySpaceAssignment(HloModule* module, int64 alternate_memory_space) @@ -265,14 +272,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { int64 max_size_in_bytes, int64 min_prefetch_interval, int64 max_prefetch_interval, const HloAliasAnalysis& alias_analysis, int64 alignment, GlobalDecreasingSizeBestFitHeap::Type type, - IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem) + IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem, + int64 max_outstanding_async_copies) : GlobalDecreasingSizeBestFitHeap(alignment, type), allocation_map_(allocation_map), max_size_in_bytes_(max_size_in_bytes), min_prefetch_interval_(min_prefetch_interval), max_prefetch_interval_(max_prefetch_interval), alias_analysis_(alias_analysis), - is_allowed_in_alternate_mem_(is_allowed_in_alternate_mem) {} + is_allowed_in_alternate_mem_(is_allowed_in_alternate_mem), + max_outstanding_async_copies_(max_outstanding_async_copies) {} HeapSimulator::Result Finish() override; @@ -281,8 +290,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // find a suitable chunk candidate within the heap size and prefetch interval // limits, and append the new allocation(s) to allocations. The new // allocations can be in default or alternate memory spaces, or can be - // prefetches or evictions. - void FindAllocation(int64 start_time, int64 end_time, + // prefetches or evictions. Returns true if successful. + bool FindAllocation(int64 start_time, int64 end_time, HloPosition defining_position, HloUse use, const HloValue* buffer, int64 size, MemorySpaceAssignment::AllocationSequence* allocations); @@ -310,6 +319,23 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // unnecessarily adding the chunk to the chunk map. 
void AddToChunkMap(const HloValue* buffer, Chunk chunk) override {} + // Returns true if the addition of an asynchronous copy in the given time + // interval would violate the maximum number of asynchronous copies. + bool ViolatesMaximumOutstandingAsyncCopies(int64 start_time, + int64 end_time) const; + + // Adds an asynchronous copy to the allocations. + void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation, + MemorySpace memory_space, Chunk chunk, int64 start_time, + int64 end_time, + MemorySpaceAssignment::AllocationSequence* allocations); + + // These methods are used for delaying committing the chunk candidate until + // the entire live range of the buffer has been considered. + void AddToPendingChunks(const BufferInterval& buffer_interval, + const ChunkCandidate& chunk_candidate); + void CommitPendingChunks(); + MemorySpaceAssignment::AllocationMap* allocation_map_; int64 max_size_in_bytes_; // The min and max prefetch intervals describe the number of independent HLOs @@ -328,6 +354,12 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { int64 max_prefetch_interval_; const HloAliasAnalysis& alias_analysis_; IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem_; + // We use an interval tree to keep track of the number of outstanding + // asynchronous copies. 
+ BufferIntervalTree async_copy_interval_tree_; + int64 max_outstanding_async_copies_; + std::vector> pending_chunks_; + std::vector> pending_async_copies_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index a7d70e915dc..99ce46c0799 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -31,7 +31,8 @@ class MemorySpaceAssignmentTest : public HloTestBase { const int64 kDefaultMemorySpace = 0; const int64 kAlternateMemorySpace = 1; - std::unique_ptr AssignMemorySpace(HloModule* module) { + std::unique_ptr AssignMemorySpace( + HloModule* module, int64 max_outstanding_async_copies = -1) { auto size_fn = [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; @@ -56,7 +57,7 @@ class MemorySpaceAssignmentTest : public HloTestBase { /*min_prefetch_interval=*/2, /*max_prefetch_interval=*/10, /*alternate_memory_space_alignment_in_bytes=*/8, size_fn, - is_allowed_in_alternate_mem) + is_allowed_in_alternate_mem, max_outstanding_async_copies) .ValueOrDie(); CheckPresetAssignments(preset_assignments.get()); return preset_assignments; @@ -80,6 +81,65 @@ class MemorySpaceAssignmentTest : public HloTestBase { << position.ToString(); } } + + std::unique_ptr CreateEvictAndPrefetchModule() { + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* tanh = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kTanh, p0)); + // tanh should be placed in the alternate memory since there isn't much + // contention in the beginning. 
However, tanh has another consumer at the + // end. So it should be kicked out to default memory and prefetched back in. + // The graph below is meant to increase the contention to force + // eviction/prefetch behavior. + HloInstruction* a = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0, tanh)); + HloInstruction* b = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* c = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1)); + HloInstruction* d = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* e = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, b)); + HloInstruction* f = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, c)); + HloInstruction* g = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, d)); + HloInstruction* h = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, c)); + HloInstruction* i = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, d)); + HloInstruction* j = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, c, d)); + HloInstruction* k = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, e, f)); + HloInstruction* l = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, g, h)); + HloInstruction* m = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, i, j)); + HloInstruction* n = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, k, l)); + HloInstruction* o = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, n, m)); + // tanh is being used at the root instruction, and this should be + // prefetched. 
+ HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, o, tanh)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, tanh, a, b, c, d, e, f, g, h, i, + j, k, l, m, n, o, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + return module; + } }; TEST_F(MemorySpaceAssignmentTest, ParameterOnly) { @@ -141,8 +201,11 @@ TEST_F(MemorySpaceAssignmentTest, Simple) { EXPECT_THAT(sub, op::ShapeWithLayout(shape_in_alternate_mem)); // Make sure the preset assignments is sane. - EXPECT_THAT(preset_assignments->chunks().size(), 2); - EXPECT_THAT(preset_assignments->sizes().size(), 1); + EXPECT_EQ(preset_assignments->chunks().size(), 2); + EXPECT_EQ(preset_assignments->sizes().size(), 1); + // Ensure the offset assigned to add and sub are different. + EXPECT_NE(preset_assignments->chunks()[0].second.offset, + preset_assignments->chunks()[1].second.offset); } TEST_F(MemorySpaceAssignmentTest, NegateChain) { @@ -209,69 +272,37 @@ TEST_F(MemorySpaceAssignmentTest, NegateChain) { } TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetch) { - HloComputation::Builder builder(TestName()); - Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); - HloInstruction* p0 = - builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); - HloInstruction* p1 = - builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); - HloInstruction* tanh = builder.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kTanh, p0)); - // tanh should be placed in the alternate memory since there isn't much - // contention in the beginning. However, tanh has another consumer at the end. - // So it should be kicked out to default memory and prefetched back in. - // The graph below is meant to increase the contention to force - // eviction/prefetch behavior. 
- HloInstruction* a = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0, tanh)); - HloInstruction* b = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); - HloInstruction* c = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1)); - HloInstruction* d = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); - HloInstruction* e = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, b)); - HloInstruction* f = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, c)); - HloInstruction* g = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, d)); - HloInstruction* h = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, c)); - HloInstruction* i = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, d)); - HloInstruction* j = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, c, d)); - HloInstruction* k = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, e, f)); - HloInstruction* l = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, g, h)); - HloInstruction* m = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, i, j)); - HloInstruction* n = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, k, l)); - HloInstruction* o = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, n, m)); - // tanh is being used at the root instruction, and this should be prefetched. 
- HloInstruction* add = builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, o, tanh)); - - auto module = CreateNewVerifiedModule(); - HloComputation* computation = module->AddEntryComputation(builder.Build()); - - HloSchedule schedule(module.get()); - schedule.set_sequence(computation, {p0, p1, tanh, a, b, c, d, e, f, g, h, i, - j, k, l, m, n, o, add}); - TF_CHECK_OK(module->set_schedule(schedule)); + std::unique_ptr module = CreateEvictAndPrefetchModule(); AssignMemorySpace(module.get()); EXPECT_THAT( - add, + module->entry_computation()->root_instruction(), op::Add(op::Add(), op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, op::AsyncCopy(kDefaultMemorySpace, kAlternateMemorySpace, op::Tanh())))); + + EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), + 2); +} + +TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies0) { + std::unique_ptr module = CreateEvictAndPrefetchModule(); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/0); + + EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), + 0); +} + +TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { + std::unique_ptr module = CreateEvictAndPrefetchModule(); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/1); + + EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), + 1); } TEST_F(MemorySpaceAssignmentTest, While) { diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index dd5df86a2a4..a30c0ec2525 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -11,15 +11,15 @@ tensorflow/python/tpu/profiler/pip_package/build_pip_package.sh tensorflow/python/tpu/profiler/pip_package/setup.py tensorflow/stream_executor/build_defs.bzl tensorflow/third_party/BUILD -tensorflow/third_party/__init__.py tensorflow/third_party/android/BUILD tensorflow/third_party/android/android.bzl.tpl 
tensorflow/third_party/android/android_configure.BUILD.tpl tensorflow/third_party/android/android_configure.bzl +tensorflow/third_party/__init__.py tensorflow/third_party/arm_neon_2_x86_sse.BUILD tensorflow/third_party/astor.BUILD -tensorflow/third_party/backports_weakref.BUILD tensorflow/third_party/boringssl/BUILD +tensorflow/third_party/backports_weakref.BUILD tensorflow/third_party/clang_toolchain/BUILD tensorflow/third_party/clang_toolchain/cc_configure_clang.bzl tensorflow/third_party/clang_toolchain/download_clang.bzl @@ -27,86 +27,86 @@ tensorflow/third_party/codegen.BUILD tensorflow/third_party/com_google_absl.BUILD tensorflow/third_party/common.bzl tensorflow/third_party/cub.BUILD -tensorflow/third_party/cython.BUILD tensorflow/third_party/curl.BUILD -tensorflow/third_party/eigen.BUILD +tensorflow/third_party/cython.BUILD tensorflow/third_party/double_conversion.BUILD -tensorflow/third_party/eigen3/BUILD -tensorflow/third_party/eigen3/Eigen/Core tensorflow/third_party/eigen3/Eigen/Cholesky +tensorflow/third_party/eigen3/Eigen/Core tensorflow/third_party/eigen3/Eigen/Eigenvalues tensorflow/third_party/eigen3/Eigen/LU tensorflow/third_party/eigen3/Eigen/QR tensorflow/third_party/eigen3/Eigen/SVD +tensorflow/third_party/eigen3/BUILD tensorflow/third_party/eigen3/LICENSE tensorflow/third_party/eigen3/gpu_packet_math.patch +tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/Tensor -tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h -tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h -tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h 
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h +tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h +tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h -tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions -tensorflow/third_party/enum34.BUILD -tensorflow/third_party/farmhash.BUILD +tensorflow/third_party/eigen.BUILD tensorflow/third_party/fft2d/BUILD tensorflow/third_party/fft2d/LICENSE -tensorflow/third_party/fft2d/fft2d.BUILD tensorflow/third_party/fft2d/fft.h +tensorflow/third_party/fft2d/fft2d.BUILD tensorflow/third_party/fft2d/fft2d.h +tensorflow/third_party/enum34.BUILD +tensorflow/third_party/farmhash.BUILD +tensorflow/third_party/git/BUILD +tensorflow/third_party/git/BUILD.tpl +tensorflow/third_party/git/git_configure.bzl tensorflow/third_party/functools32.BUILD tensorflow/third_party/gast.BUILD tensorflow/third_party/gif.BUILD -tensorflow/third_party/git/BUILD.tpl -tensorflow/third_party/git/BUILD -tensorflow/third_party/git/git_configure.bzl -tensorflow/third_party/googleapis.BUILD -tensorflow/third_party/gpus/BUILD tensorflow/third_party/gpus/crosstool/BUILD tensorflow/third_party/gpus/crosstool/BUILD.tpl tensorflow/third_party/gpus/crosstool/LICENSE tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl 
tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +tensorflow/third_party/gpus/BUILD tensorflow/third_party/gpus/cuda/BUILD -tensorflow/third_party/gpus/cuda/BUILD.tpl tensorflow/third_party/gpus/cuda/BUILD.windows.tpl -tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl +tensorflow/third_party/gpus/cuda/BUILD.tpl tensorflow/third_party/gpus/cuda/LICENSE +tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl tensorflow/third_party/gpus/cuda/cuda_config.h.tpl -tensorflow/third_party/gpus/cuda_configure.bzl tensorflow/third_party/gpus/rocm/BUILD -tensorflow/third_party/gpus/rocm/build_defs.bzl.tpl tensorflow/third_party/gpus/rocm/BUILD.tpl tensorflow/third_party/gpus/rocm/rocm_config.h.tpl +tensorflow/third_party/gpus/rocm/build_defs.bzl.tpl +tensorflow/third_party/gpus/cuda_configure.bzl tensorflow/third_party/gpus/find_cuda_config.py tensorflow/third_party/gpus/rocm_configure.bzl +tensorflow/third_party/googleapis.BUILD tensorflow/third_party/grpc/BUILD tensorflow/third_party/icu/udata.patch -tensorflow/third_party/jsoncpp.BUILD -tensorflow/third_party/kafka/config.patch tensorflow/third_party/kafka/BUILD -tensorflow/third_party/libxsmm.BUILD -tensorflow/third_party/linenoise.BUILD +tensorflow/third_party/kafka/config.patch +tensorflow/third_party/jsoncpp.BUILD tensorflow/third_party/llvm/BUILD tensorflow/third_party/llvm/expand_cmake_vars.py tensorflow/third_party/llvm/llvm.autogenerated.BUILD tensorflow/third_party/llvm/llvm.bzl -tensorflow/third_party/mkl/LICENSE +tensorflow/third_party/libxsmm.BUILD +tensorflow/third_party/linenoise.BUILD +tensorflow/third_party/lmdb.BUILD tensorflow/third_party/mkl/BUILD +tensorflow/third_party/mkl/LICENSE tensorflow/third_party/mkl/MKL_LICENSE tensorflow/third_party/mkl/build_defs.bzl tensorflow/third_party/mkl/mkl.BUILD -tensorflow/third_party/lmdb.BUILD -tensorflow/third_party/mkl_dnn/mkldnn.BUILD tensorflow/third_party/mkl_dnn/LICENSE +tensorflow/third_party/mkl_dnn/mkldnn.BUILD 
tensorflow/third_party/mpi/.gitignore tensorflow/third_party/mpi/BUILD tensorflow/third_party/mpi_collectives/BUILD @@ -121,89 +121,87 @@ tensorflow/third_party/nccl/system.BUILD.tpl tensorflow/third_party/ngraph/BUILD tensorflow/third_party/ngraph/LICENSE tensorflow/third_party/ngraph/NGRAPH_LICENSE +tensorflow/third_party/ngraph/ngraph_tf.BUILD tensorflow/third_party/ngraph/build_defs.bzl tensorflow/third_party/ngraph/ngraph.BUILD tensorflow/third_party/ngraph/nlohmann_json.BUILD -tensorflow/third_party/ngraph/ngraph_tf.BUILD tensorflow/third_party/ngraph/tbb.BUILD tensorflow/third_party/opt_einsum.BUILD tensorflow/third_party/pcre.BUILD tensorflow/third_party/png.BUILD tensorflow/third_party/png_fix_rpi.patch -tensorflow/third_party/pprof.BUILD tensorflow/third_party/protobuf/BUILD -tensorflow/third_party/py/BUILD.tpl -tensorflow/third_party/py/BUILD +tensorflow/third_party/pprof.BUILD tensorflow/third_party/py/numpy/BUILD +tensorflow/third_party/py/BUILD +tensorflow/third_party/py/BUILD.tpl tensorflow/third_party/py/python_configure.bzl -tensorflow/third_party/pybind11.BUILD tensorflow/third_party/python_runtime/BUILD +tensorflow/third_party/pybind11.BUILD tensorflow/third_party/repo.bzl tensorflow/third_party/six.BUILD tensorflow/third_party/snappy.BUILD tensorflow/third_party/sqlite.BUILD tensorflow/third_party/swig.BUILD tensorflow/third_party/sycl/crosstool/BUILD -tensorflow/third_party/systemlibs/BUILD tensorflow/third_party/systemlibs/BUILD.tpl +tensorflow/third_party/systemlibs/BUILD tensorflow/third_party/systemlibs/absl_py.BUILD -tensorflow/third_party/systemlibs/absl_py.absl.flags.BUILD tensorflow/third_party/systemlibs/absl_py.absl.testing.BUILD +tensorflow/third_party/systemlibs/absl_py.absl.flags.BUILD tensorflow/third_party/systemlibs/astor.BUILD -tensorflow/third_party/systemlibs/build_defs.bzl.tpl tensorflow/third_party/systemlibs/boringssl.BUILD -tensorflow/third_party/systemlibs/cython.BUILD 
+tensorflow/third_party/systemlibs/build_defs.bzl.tpl tensorflow/third_party/systemlibs/curl.BUILD +tensorflow/third_party/systemlibs/cython.BUILD tensorflow/third_party/systemlibs/double_conversion.BUILD tensorflow/third_party/systemlibs/gast.BUILD tensorflow/third_party/systemlibs/gif.BUILD -tensorflow/third_party/systemlibs/google_cloud_cpp.BUILD tensorflow/third_party/systemlibs/google_cloud_cpp.google.cloud.bigtable.BUILD -tensorflow/third_party/systemlibs/grpc.BUILD +tensorflow/third_party/systemlibs/google_cloud_cpp.BUILD tensorflow/third_party/systemlibs/googleapis.BUILD -tensorflow/third_party/systemlibs/lmdb.BUILD -tensorflow/third_party/systemlibs/nsync.BUILD tensorflow/third_party/systemlibs/jsoncpp.BUILD +tensorflow/third_party/systemlibs/grpc.BUILD +tensorflow/third_party/systemlibs/lmdb.BUILD tensorflow/third_party/systemlibs/opt_einsum.BUILD +tensorflow/third_party/systemlibs/nsync.BUILD tensorflow/third_party/systemlibs/pcre.BUILD -tensorflow/third_party/systemlibs/png.BUILD tensorflow/third_party/systemlibs/protobuf.BUILD -tensorflow/third_party/systemlibs/protobuf.bzl -tensorflow/third_party/systemlibs/re2.BUILD tensorflow/third_party/systemlibs/six.BUILD -tensorflow/third_party/systemlibs/snappy.BUILD +tensorflow/third_party/systemlibs/protobuf.bzl +tensorflow/third_party/systemlibs/png.BUILD +tensorflow/third_party/systemlibs/re2.BUILD tensorflow/third_party/systemlibs/sqlite.BUILD tensorflow/third_party/systemlibs/swig.BUILD +tensorflow/third_party/systemlibs/snappy.BUILD tensorflow/third_party/systemlibs/syslibs_configure.bzl tensorflow/third_party/systemlibs/termcolor.BUILD tensorflow/third_party/systemlibs/zlib.BUILD tensorflow/third_party/tensorrt/BUILD -tensorflow/third_party/tensorrt/LICENSE tensorflow/third_party/tensorrt/BUILD.tpl tensorflow/third_party/tensorrt/build_defs.bzl.tpl +tensorflow/third_party/tensorrt/LICENSE tensorflow/third_party/tensorrt/tensorrt/include/tensorrt_config.h.tpl 
tensorflow/third_party/tensorrt/tensorrt_configure.bzl tensorflow/third_party/termcolor.BUILD tensorflow/third_party/tflite_mobilenet.BUILD tensorflow/third_party/tflite_mobilenet_float.BUILD tensorflow/third_party/tflite_mobilenet_quant.BUILD -tensorflow/third_party/tflite_ovic_testdata.BUILD -tensorflow/third_party/tflite_smartreply.BUILD -tensorflow/third_party/toolchains/BUILD tensorflow/third_party/toolchains/clang6/BUILD tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl tensorflow/third_party/toolchains/clang6/README.md tensorflow/third_party/toolchains/clang6/clang.BUILD tensorflow/third_party/toolchains/clang6/repo.bzl -tensorflow/third_party/toolchains/cpus/arm/cc_config.bzl.tpl -tensorflow/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl +tensorflow/third_party/toolchains/BUILD tensorflow/third_party/toolchains/cpus/arm/BUILD +tensorflow/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl +tensorflow/third_party/toolchains/cpus/arm/cc_config.bzl.tpl tensorflow/third_party/toolchains/cpus/py/BUILD tensorflow/third_party/toolchains/cpus/py3/BUILD tensorflow/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/BUILD tensorflow/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/build_defs.bzl tensorflow/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/BUILD +tensorflow/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/build_defs.bzl tensorflow/third_party/toolchains/preconfig/centos6/gcc7/BUILD tensorflow/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl @@ -216,8 +214,8 @@ tensorflow/third_party/toolchains/preconfig/centos6/py3/BUILD tensorflow/third_party/toolchains/preconfig/centos6/tensorrt5/BUILD tensorflow/third_party/toolchains/preconfig/centos6/tensorrt5/build_defs.bzl 
tensorflow/third_party/toolchains/preconfig/generate/BUILD -tensorflow/third_party/toolchains/preconfig/generate/containers.bzl tensorflow/third_party/toolchains/preconfig/generate/archives.bzl +tensorflow/third_party/toolchains/preconfig/generate/containers.bzl tensorflow/third_party/toolchains/preconfig/generate/generate.bzl tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD @@ -227,9 +225,9 @@ tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_too tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl +tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc5-rocm/BUILD @@ -242,18 +240,20 @@ tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc- tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/BUILD 
tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/win_1803/BUILD tensorflow/third_party/toolchains/preconfig/win_1803/bazel_025/BUILD +tensorflow/third_party/toolchains/preconfig/win_1803/BUILD tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD tensorflow/third_party/toolchains/remote/BUILD -tensorflow/third_party/toolchains/remote/BUILD.tpl tensorflow/third_party/toolchains/remote/configure.bzl +tensorflow/third_party/toolchains/remote/BUILD.tpl tensorflow/third_party/toolchains/remote/execution.bzl.tpl +tensorflow/third_party/tflite_ovic_testdata.BUILD +tensorflow/third_party/tflite_smartreply.BUILD tensorflow/third_party/wrapt.BUILD tensorflow/third_party/zlib.BUILD tensorflow/tools/ci_build/remote/BUILD @@ -270,9 +270,9 @@ tensorflow/tools/lib_package/libtensorflow_test.c tensorflow/tools/lib_package/libtensorflow_test.sh tensorflow/tools/pip_package/BUILD tensorflow/tools/pip_package/MANIFEST.in -tensorflow/tools/pip_package/README -tensorflow/tools/pip_package/check_load_py_test.py tensorflow/tools/pip_package/build_pip_package.sh +tensorflow/tools/pip_package/check_load_py_test.py +tensorflow/tools/pip_package/README tensorflow/tools/pip_package/pip_smoke_test.py tensorflow/tools/pip_package/setup.py tensorflow/tools/pip_package/simple_console.py