[entropy.spm] improve handling of non-ascii paths (e.g. puppet-agent).

It looks like Portage now stores unicode paths correctly in its metadata,
unlike in the past. We need to make sure that we parse the "CONTENTS" file,
and content metadata in general, using the correct encoding.
This will allow us to store and retrieve such metadata from the sqlite3
database correctly, and also to match the stored paths exactly against the
filesystem paths.

This commit may need a bit more real-life testing. Backward compatibility
with old Entropy and Portage tbz2 files should work as expected.
Unit tests are attached.
This commit is contained in:
Fabio Erculiani
2018-09-23 22:06:05 +02:00
parent 34c549feb8
commit e9b211eea0
5 changed files with 69 additions and 22 deletions

View File

@@ -1410,7 +1410,11 @@ class PortagePlugin(SpmPlugin):
data['datecreation'] = str(os.path.getmtime(package_file))
data['size'] = str(entropy.tools.get_file_size(package_file))
tmp_dir = const_mkdtemp(prefix="entropy.spm._extract")
# This allows os.* functions on Python2 to use unicode, correctly.
# See the issues with puppet-agent (unit tests in db.py).
tmp_dir = const_convert_to_unicode(
const_mkdtemp(prefix="entropy.spm._extract"),
enctype = sys.getfilesystemencoding())
meta_dir = os.path.join(tmp_dir, "portage")
pkg_dir = os.path.join(tmp_dir, "pkg")
os.mkdir(meta_dir)
@@ -3033,15 +3037,15 @@ class PortagePlugin(SpmPlugin):
from portage.dbapi.vartree import write_contents
entropy_content_iter = entropy_package_metadata['content']
sys_root = const_convert_to_rawstring(etpConst['systemroot'])
# Make sure that we use the fs encoding. This works with both
# old and new Entropy packages.
sys_root = const_convert_to_unicode(
etpConst['systemroot'], enctype=sys.getfilesystemencoding())
content_meta = {}
try:
for _package_id, _path, _ftype in entropy_content_iter:
_ftype = const_convert_to_rawstring(_ftype)
path_orig = const_convert_to_rawstring(_path)
path = sys_root + path_orig
for _package_id, path, _ftype in entropy_content_iter:
path = sys_root + path
is_sym = os.path.islink(path)
if os.path.isfile(path) and not is_sym:
@@ -4648,22 +4652,28 @@ class PortagePlugin(SpmPlugin):
if os.path.isfile(content_file):
with open(content_file, "rb") as f:
content = [const_convert_to_unicode(x) for x in f.readlines()]
for line in f.readlines():
try:
# Modern Entropy/Portage correctly use unicode.
line = const_convert_to_unicode(
line, enctype=sys.getfilesystemencoding())
except UnicodeDecodeError:
# Support for very ancient Entropy or Portage packages.
line = const_convert_to_unicode(
line, enctype=etpConst['conf_raw_encoding'])
outcontent = set()
for line in content:
line = line.strip().split()
try:
line = line.strip().split(" ")
datatype = line[0]
datafile = line[1:]
if datatype == obj_t:
datafile = datafile[:-2]
datafile = ' '.join(datafile)
datafile = " ".join(datafile)
elif datatype in (dir_t, fif_t, dev_t):
datafile = ' '.join(datafile)
datafile = " ".join(datafile)
elif datatype == sym_t:
datafile = datafile[:-3]
datafile = ' '.join(datafile)
datafile = " ".join(datafile)
else:
myexc = "%s %s. %s." % (
datafile,
@@ -4675,19 +4685,22 @@ class PortagePlugin(SpmPlugin):
warnings.warn(
"Empty file path detected, skipping!")
continue
outcontent.add((datafile, datatype))
except:
pass
outcontent = sorted(outcontent)
for datafile, datatype in outcontent:
pkg_content[datafile] = datatype
pkg_content[datafile] = datatype
else:
# CONTENTS is not generated when a package is emerged with
# portage and the option -B
# we have to use the unpacked package file and generate content dict
try:
# Modern Entropy/Portage correctly use unicode.
pkg_dir = const_convert_to_unicode(
pkg_dir, enctype=sys.getfilesystemencoding())
except UnicodeDecodeError:
# Support for very ancient Entropy or Portage packages.
pkg_dir = const_convert_to_unicode(
pkg_dir, enctype=etpConst['conf_raw_encoding'])
tmpdir_len = len(pkg_dir)
for currentdir, subdirs, files in os.walk(pkg_dir):
cur_dir = currentdir[tmpdir_len:]

View File

@@ -19,6 +19,14 @@ def get_test_generic_package(test_pkg):
path = _get_test_generic_package_path(test_pkg)
return path
def get_test_package_ca_certs():
    """Return the path of the ca-certificates test package."""
    return get_test_generic_package("ca-certificates-20180409.3.37.tbz2")
def get_test_package_puppet_agent():
    """Return the path of the puppet-agent test package."""
    return get_test_generic_package("puppet-agent-6.0.0.tbz2")
def get_test_package():
    """Return the path of the default (zlib) test package."""
    return get_test_generic_package("zlib-1.2.3-r1.tbz2")

View File

@@ -423,6 +423,32 @@ class EntropyRepositoryTest(unittest.TestCase):
self.assertTrue(isinstance(results, set))
self.assertTrue(rc == 1)
def test_db_handle_unicode_puppet_agent(self):
    """
    Round-trip the puppet-agent package metadata through the test
    repository and check that what is read back equals what was extracted.
    """
    pkg_path = _misc.get_test_package_puppet_agent()
    spm_meta = self.Spm.extract_package_metadata(pkg_path)
    package_id = self.test_db.addPackage(spm_meta)
    stored_meta = self.test_db.getPackageData(package_id)
    # Apply the same cleanup to both metadata objects before comparing.
    _misc.clean_pkg_metadata(stored_meta)
    _misc.clean_pkg_metadata(spm_meta)
    self.assertEqual(spm_meta, stored_meta)
    self.test_db.removePackage(package_id)
def test_db_handle_unicode_ca_certs(self):
    """
    Round-trip the ca-certificates package metadata through the test
    repository and check that what is read back equals what was extracted.
    """
    pkg_path = _misc.get_test_package_ca_certs()
    spm_meta = self.Spm.extract_package_metadata(pkg_path)
    package_id = self.test_db.addPackage(spm_meta)
    stored_meta = self.test_db.getPackageData(package_id)
    # Apply the same cleanup to both metadata objects before comparing.
    _misc.clean_pkg_metadata(stored_meta)
    _misc.clean_pkg_metadata(spm_meta)
    self.assertEqual(spm_meta, stored_meta)
    self.test_db.removePackage(package_id)
def test_db_insert_compare_match_utf(self):
# insert/compare

Binary file not shown.

Binary file not shown.