Fix GetMatchingPaths with special chars bug.

After 5659465166, `GetMatchingPaths` was converted to use RE2 instead of `fnmatch` as that allows non-local filesystems (e.g., GCS, Hadoop, S3) to also be used from Windows. However, this breaks compatibility between `tf.io.gfile.glob` and Python `glob` and that results in tests silently failing or examples being silently skipped during training.

The fix is two-pronged. First, to fix  only, we add regexp replacements for `(` and `)` in the pattern, escaping them before matching. After testing and seeing that this works, we then re-enable `fnmatch` on POSIX environments to reduce binary size, just like we did for mobile platforms.

Fixes  (everywhere) and  (on posix platforms).

Tested via `bazel run //tensorflow/python:file_io_test` after adding a test for the pattern in .

Will need to be cherry-picked onto `r2.2` branch.

PiperOrigin-RevId: 303009914
Change-Id: Ieab047f63e9ba6bb0ec0499e0fa864f6ca6090ff
This commit is contained in:
Mihai Maruseac 2020-03-25 17:46:09 -07:00 committed by TensorFlower Gardener
parent bfc51b2d23
commit 56f4edbb5a
2 changed files with 26 additions and 10 deletions
tensorflow
core/platform
python/lib/io

View File

@ -16,9 +16,6 @@ limitations under the License.
#include "tensorflow/core/platform/file_system.h"
#include <sys/stat.h>
#if defined(IS_MOBILE_PLATFORM)
#include <fnmatch.h>
#endif
#include <algorithm>
#include <deque>
@ -26,12 +23,15 @@ limitations under the License.
#include <utility>
#include <vector>
#if defined(PLATFORM_POSIX) || defined(IS_MOBILE_PLATFORM)
#include <fnmatch.h>
#else
#include "tensorflow/core/platform/regexp.h"
#endif // defined(PLATFORM_POSIX) || defined(IS_MOBILE_PLATFORM)
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/platform.h"
#if !defined(IS_MOBILE_PLATFORM)
#include "tensorflow/core/platform/regexp.h"
#endif
#include "tensorflow/core/platform/scanner.h"
#include "tensorflow/core/platform/str_util.h"
#include "tensorflow/core/platform/strcat.h"
@ -39,16 +39,20 @@ limitations under the License.
namespace tensorflow {
bool FileSystem::Match(const string& filename, const string& pattern) {
#if defined(IS_MOBILE_PLATFORM)
#if defined(PLATFORM_POSIX) || defined(IS_MOBILE_PLATFORM)
// We avoid relying on RE2 on mobile platforms, because it incurs a
// significant binary size increase.
// For POSIX platforms, there is no need to depend on RE2 if `fnmatch` can be
// used safely.
return fnmatch(pattern.c_str(), filename.c_str(), FNM_PATHNAME) == 0;
#else
string regexp(pattern);
RE2::GlobalReplace(&regexp, "\\*", "[^/]*");
RE2::GlobalReplace(&regexp, "\\?", ".");
regexp = str_util::StringReplace(regexp, "*", "[^/]*", true);
regexp = str_util::StringReplace(regexp, "?", ".", true);
regexp = str_util::StringReplace(regexp, "(", "\\(", true);
regexp = str_util::StringReplace(regexp, ")", "\\)", true);
return RE2::FullMatch(filename, regexp);
#endif
#endif // defined(PLATFORM_POSIX) || defined(IS_MOBILE_PLATFORM)
}
string FileSystem::TranslateName(const string& name) const {

View File

@ -159,6 +159,18 @@ class FileIoTest(test.TestCase):
file_io.delete_recursively(dir_path)
self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
def testGetMatchingFilesWhenParentDirContainsParantheses(self):
dir_path = os.path.join(self._base_dir, "dir_(special)")
file_io.create_dir(dir_path)
files = ["file1.txt", "file(2).txt"]
for name in files:
file_path = os.path.join(dir_path, name)
file_io.FileIO(file_path, mode="w").write("testing")
expected_match = [os.path.join(dir_path, name) for name in files]
glob_pattern = os.path.join(dir_path, "*")
self.assertItemsEqual(
file_io.get_matching_files(glob_pattern), expected_match)
def testCreateRecursiveDir(self):
dir_path = os.path.join(self._base_dir, "temp_dir/temp_dir1/temp_dir2")
file_io.recursive_create_dir(dir_path)