[arch-dev-public] [PATCH 7/7] Make the depends module not suck
Dan McGee
dan at archlinux.org
Mon Sep 28 00:16:26 EDT 2009
Namcap being slow pissed me off enough that I wanted to figure out what
could be taking so long. As is usual with software, 1 module was causing
about 90% of the slowdown. Compare the before and after with this patch:
$ time namcap -m -r depends {4 packages} >/dev/null
real 0m18.001s
user 0m9.313s
sys 0m7.966s
$ time ./namcap.py -m -r depends {4 packages }>/dev/null
real 0m4.512s
user 0m3.770s
sys 0m0.740s
This patch addresses a few issue noticed when crawling and profiling the code
using cProfile. First, we were doing O(n*m) string compares of files in the
local database with libraries in the package. We can use a few tricks to cut
down the number of checks- namely, once a library has been found we don't need
to look for it anymore, and we can also ignore any non-'.so' files.
Another part where we were really shooting ourselves in the foot was invoking
readelf on *every single file* in the package. Cut that down so we only call
readelf on files with the ELF magic bytes, which gives us a dramatic savings as
we no longer fork for files in /usr/share, for instance.
Bonus material: script checking left out perl, and script dependencies were
formerly listed as link-level dependencies as well. Both of these have been
fixed.
The sad part is there is still room to improve here, but I decided enough
was enough for now. :)
Signed-off-by: Dan McGee <dan at archlinux.org>
---
Namcap/depends.py | 104 ++++++++++++++++++++++++++++++++---------------------
namcap-tags | 2 +-
2 files changed, 64 insertions(+), 42 deletions(-)
diff --git a/Namcap/depends.py b/Namcap/depends.py
index e851e5c..4a3f733 100644
--- a/Namcap/depends.py
+++ b/Namcap/depends.py
@@ -19,7 +19,7 @@
import re, os, os.path, pacman, subprocess
-supported_scripts = ['python', 'ruby', 'wish', 'expect',
+supported_scripts = ['python', 'perl', 'ruby', 'wish', 'expect',
'tk', 'bash', 'sh', 'dash', 'tcsh', 'pdksh' ]
pkgcache = {}
@@ -50,7 +50,8 @@ def getcovered(current, dependlist, covereddepend):
def figurebitsize(line):
"""
- Given a line of output from readelf (usually Shared library:) return 'i686' or 'x86-64' if the binary is a 32bit or 64bit binary
+ Given a line of output from readelf (usually Shared library:) return
+ 'i686' or 'x86-64' if the binary is a 32bit or 64bit binary
"""
address = line.split()[0]
@@ -59,6 +60,22 @@ def figurebitsize(line):
else:
return 'i686'
+def is_elf(path):
+ """
+ Given a file path, ensure it exists and peek at the first few bytes
+ to determine if it is an ELF file.
+ """
+ if not os.path.isfile(path):
+ return False
+ fd = open(path)
+ bytes = fd.read(4)
+ fd.close()
+ # magic elf header, present in binaries and libraries
+ if bytes == "\x7FELF":
+ return True
+ else:
+ return False
+
def scanlibs(data, dirname, names):
"""
Walk over all the files in the package and run "readelf -d" on them.
@@ -66,41 +83,39 @@ def scanlibs(data, dirname, names):
"""
sharedlibs, scripts = data
global supported_scripts
+ shared = re.compile('Shared library: \[(.*)\]')
+ script = re.compile('#!.*/(.*)')
for i in names:
- if os.path.isfile(dirname+'/'+i):
- var = subprocess.Popen('readelf -d ' + dirname+'/'+i,
+ path = dirname + '/' + i
+ if is_elf(path):
+ var = subprocess.Popen('readelf -d ' + path,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
for j in var[0].split('\n'):
- n = re.search('Shared library: \[(.*)\]', j)
+ n = shared.search(j)
# Is this a Shared library: line?
if n != None:
# Find out its architecture
architecture = figurebitsize(j)
try:
libpath = os.path.abspath(libcache[architecture][n.group(1)])[1:]
- sharedlibs.setdefault(libpath, {})[dirname+'/'+i] = 1
+ sharedlibs.setdefault(libpath, {})[path] = 1
except KeyError:
# Ignore that library if we can't find it
# TODO: review it
pass
- # But we can check to see if it's a script we know about
- else:
- try:
- fd = open(dirname+'/'+i)
- except:
- continue
- firstline = fd.readline()
- firstlinere = re.match('#!.*/(.*)', firstline)
- if firstlinere != None:
- cmdname = firstlinere.group(1).split()[0]
- if cmdname in supported_scripts:
- scripts.setdefault(cmdname, {})[dirname+'/'+i] = 1
-
- fd.close()
- return
-
+ # Maybe it is a script file
+ elif os.path.isfile(path):
+ fd = open(path)
+ firstline = fd.readline()
+ fd.close()
+ firstlinere = script.match(firstline)
+ if firstlinere != None:
+ cmdname = firstlinere.group(1).split()[0]
+ if cmdname in supported_scripts:
+ scripts.setdefault(cmdname, {})[path] = 1
+
def finddepends(liblist):
dependlist = {}
foundlist = []
@@ -108,24 +123,28 @@ def finddepends(liblist):
somatches = {}
actualpath = {}
- for j in liblist.keys():
+ knownlibs = liblist.keys()
+
+ for j in knownlibs:
actualpath[j] = os.path.realpath('/'+j)[1:]
# Sometimes packages don't include all so .so, .so.1, .so.1.13, .so.1.13.19 files
# They rely on ldconfig to create all the symlinks
# So we will strip off the matching part of the files and use this regexp to match the rest
so_end = re.compile('(\.\d+)*')
+ # Whether we should even look at a particular file
+ is_so = re.compile('\.so')
pacmandb = '/var/lib/pacman/local'
for i in os.listdir(pacmandb):
if os.path.isfile(pacmandb+'/'+i+'/files'):
file = open(pacmandb+'/'+i+'/files')
for j in file.readlines():
- if j[len(j)-1:]=='\n':
- j = j[:len(j)-1]
+ if not is_so.search(j):
+ continue
- for k in liblist.keys():
- # If the file is an exact match, so it's a match up to a point and everything after that point matches a the regexp
+ for k in knownlibs:
+ # File must be an exact match or have the right .so ending numbers
# i.e. gpm includes libgpm.so and libgpm.so.1.19.0, but everything links to libgpm.so.1
# We compare find libgpm.so.1.19.0 startswith libgpm.so.1 and .19.0 matches the regexp
if j == actualpath[k] or (j.startswith(actualpath[k]) and so_end.match(j[len(actualpath[k]):])):
@@ -134,13 +153,14 @@ def finddepends(liblist):
dependlist[n.group(1)] = {}
for x in liblist[k]:
dependlist[n.group(1)][x] = 1
+ knownlibs.remove(k)
foundlist.append(k)
file.close()
ret = []
- for i in liblist.keys():
- if i not in foundlist:
- ret.append('Library ' + i + ' has no package associated')
+ # knownlibs only contains what hasn't been found at this point
+ for i in knownlibs:
+ ret.append('Library ' + i + ' has no package associated')
return dependlist, ret
def getprovides(depends, provides):
@@ -155,8 +175,9 @@ def filllibcache():
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
+ libline = re.compile('\s*(.*) \((.*)\) => (.*)')
for j in var[0].split('\n'):
- g = re.match('\s*(.*) \((.*)\) => (.*)',j)
+ g = libline.match(j)
if g != None:
if g.group(2).startswith('libc6,x86-64'):
libcache['x86-64'][g.group(1)] = g.group(3)
@@ -190,6 +211,16 @@ class package:
for i in tmpret:
ret[1].append(i)
+ # Remove the package name from that list, we can't depend on ourselves.
+ if dependlist.has_key(pkginfo.name):
+ del dependlist[pkginfo.name]
+
+ # Print link-level deps
+ for i, v in dependlist.iteritems():
+ if type(v) == dict:
+ files = [x[len(data)+1:] for x in v.keys()]
+ ret[2].append(("link-level-dependence %s in %s", (i, str(files))))
+
# Do the script handling stuff
for i, v in liblist[1].iteritems():
if not dependlist.has_key(i):
@@ -199,16 +230,6 @@ class package:
files = [x[len(data)+1:] for x in v.keys()]
ret[2].append(("script-link-detected %s in %s", (i, str(files))))
- # Remove the package name from that list, we can't depend on ourselves.
- if dependlist.has_key(pkginfo.name):
- del dependlist[pkginfo.name]
-
- # Do the info stuff
- for i, v in dependlist.iteritems():
- if type(v) == dict:
- files = [x[len(data)+1:] for x in v.keys()]
- ret[2].append(("link-level-dependence %s on %s", (str(files), i)))
-
# Check for packages in testing
if os.path.isdir('/var/lib/pacman/sync/testing'):
for i in dependlist.keys():
@@ -267,4 +288,5 @@ class package:
return ret
def type(self):
return "tarball"
+
# vim: set ts=4 sw=4 noet:
diff --git a/namcap-tags b/namcap-tags
index c6f170e..710077c 100644
--- a/namcap-tags
+++ b/namcap-tags
@@ -35,7 +35,7 @@ incorrect-permissions %s (%s/%s) :: File (%s) has %s/%s permissions.
info-dir-file-present %s :: Info directory file (%s) needs removed.
insecure-rpath %s :: Insecure RPATH (%s). If present, RPATH should be only /usr/lib.
libtool-file-present %s :: File (%s) is a libtool file.
-link-level-dependence %s on %s :: File '%s' link-level dependence on %s
+link-level-dependence %s in %s :: Link-level dependence (%s) in file %s
mime-cache-not-updated :: Mime-file found. Add "update-mime-database usr/share/mime" to the install file
missing-contributor :: Missing Contributor tag
missing-custom-license-dir usr/share/licenses/%s :: Missing custom license directory (usr/share/licenses/%s)
--
1.6.4.4
More information about the arch-dev-public
mailing list