From a0d8c18b49ddfefd2dd79386c6138720ac1fd8a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C5=82awomir=20Nizio?= <slawomir.nizio@sabayon.org>
Date: Wed, 14 Nov 2018 09:10:34 +0100
Subject: [PATCH] [entropy.spm] speed up xpak finding

Now it's chunk based, so fewer reads and seeks, and searching itself
is improved as well.
---
 .../interfaces/portage_plugin/xpaktools.py    | 73 ++++++++++---------
 1 file changed, 40 insertions(+), 33 deletions(-)

diff --git a/lib/entropy/spm/plugins/interfaces/portage_plugin/xpaktools.py b/lib/entropy/spm/plugins/interfaces/portage_plugin/xpaktools.py
index 399b70303..6da07d422 100644
--- a/lib/entropy/spm/plugins/interfaces/portage_plugin/xpaktools.py
+++ b/lib/entropy/spm/plugins/interfaces/portage_plugin/xpaktools.py
@@ -2,6 +2,7 @@
 """
 
     @author: Fabio Erculiani <lxnay@sabayon.org>
+    @author: Slawomir Nizio <slawomir.nizio@sabayon.org>
     @contact: lxnay@sabayon.org
     @copyright: Fabio Erculiani
     @license: GPL-2
@@ -107,52 +108,58 @@ def suck_xpak(tbz2file, xpakpath):
     @return: 
     @rtype: 
     """
+    if const_is_python3():
+        xpak_end = b"XPAKSTOP"
+        xpak_start = b"XPAKPACK"
+    else:
+        xpak_end = "XPAKSTOP"
+        xpak_start = "XPAKPACK"
+
+    chunk_size = 2048
+
+    # Sanity check: makes the position calculations easier (seek_length below).
+    assert len(xpak_end) == len(xpak_start)
+
     old, db = None, None
     try:
         old = open(tbz2file, "rb")
         db = open(xpakpath, "wb")
-        # position old to the end
-        old.seek(0, os.SEEK_END)
-        # read backward until we find
-        n_bytes = old.tell()
-        counter = n_bytes - 1
-        if const_is_python3():
-            xpak_end = b"XPAKSTOP"
-            xpak_start = b"XPAKPACK"
-            xpak_entry_point = b"X"
-        else:
-            xpak_end = "XPAKSTOP"
-            xpak_start = "XPAKPACK"
-            xpak_entry_point = "X"
-
-        xpak_tag_len = len(xpak_start)
-        chunk_len = 3
         data_start_position = None
         data_end_position = None
+        # position old to the end
+        old.seek(0, os.SEEK_END)
+        n_bytes = old.tell()
 
-        while counter >= (0 - chunk_len):
+        chunk_size = min(chunk_size, n_bytes)
 
-            old.seek(counter - n_bytes, os.SEEK_END)
-            if (n_bytes - (abs(counter - n_bytes))) < chunk_len:
-                chunk_len = 1
-            read_bytes = old.read(chunk_len)
-            read_len = len(read_bytes)
+        # position one chunk from the end, then continue
+        seek_pos = n_bytes - chunk_size
 
-            entry_idx = read_bytes.rfind(xpak_entry_point)
-            if entry_idx != -1:
+        while True:
+            old.seek(seek_pos, os.SEEK_SET)
+            read_bytes = old.read(chunk_size)
 
-                cut_gotten = read_bytes[entry_idx:]
-                offset = xpak_tag_len - len(cut_gotten)
-                chunk = cut_gotten + old.read(offset)
+            end_idx = read_bytes.rfind(xpak_end)
+            if end_idx != -1:
+                if data_start_position is None:
+                    data_end_position = seek_pos + end_idx + len(xpak_end)
+                    # avoid START after END in rfind()
+                    read_bytes = read_bytes[:end_idx]
 
-                if (chunk == xpak_end) and (data_start_position is None):
-                    data_end_position = old.tell()
-
-                elif (chunk == xpak_start) and (data_end_position is not None):
-                    data_start_position = old.tell() - xpak_tag_len
+            start_idx = read_bytes.rfind(xpak_start)
+            if start_idx != -1:
+                if data_end_position is not None:
+                    data_start_position = seek_pos + start_idx
                     break
 
-            counter -= read_len
+            if seek_pos == 0:
+                break
+
+            # Make sure the seeks are so that there is enough overlap.
+            seek_length = chunk_size - (len(xpak_start) - 1)
+            seek_pos -= seek_length
+            if seek_pos < 0:
+                seek_pos = 0
 
         if data_start_position is None:
             return False