From 70e25c653ab87fbc2273328e89544d4124f52065 Mon Sep 17 00:00:00 2001 From: bachmann Date: Tue, 10 Jun 2014 20:57:00 -0400 Subject: [PATCH] Add python3 support --- .travis.yml | 1 + ez_setup.py | 270 ------------------------------------------ pybloom/__init__.py | 1 - pybloom/benchmarks.py | 32 ++--- pybloom/pybloom.py | 51 ++++---- pybloom/tests.py | 58 +++++---- pybloom/utils.py | 24 ++++ setup.py | 9 +- tox.ini | 6 +- 9 files changed, 105 insertions(+), 347 deletions(-) delete mode 100644 ez_setup.py create mode 100644 pybloom/utils.py diff --git a/.travis.yml b/.travis.yml index 1c045d6..1bb8d83 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python python: - "2.6" - "2.7" + - "3.4" install: - pip install -r requirements.txt diff --git a/ez_setup.py b/ez_setup.py deleted file mode 100644 index 4848faf..0000000 --- a/ez_setup.py +++ /dev/null @@ -1,270 +0,0 @@ -#!python -"""Bootstrap setuptools installation - -If you want to use setuptools in your package's setup.py, just include this -file in the same directory with it, and add this to the top of your setup.py:: - - from ez_setup import use_setuptools - use_setuptools() - -If you want to require a specific version of setuptools, set a download -mirror, or use an alternate download directory, you can do so by supplying -the appropriate options to ``use_setuptools()``. - -This file can also be run as a script to install or upgrade setuptools. 
-""" -import sys -DEFAULT_VERSION = "0.6c9" -DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] - -md5_data = { - 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', - 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', - 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', - 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', - 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', - 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', - 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', - 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', - 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', - 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', - 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', - 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', - 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', - 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', - 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', - 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', - 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', - 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', - 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', - 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', - 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', - 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', - 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', - 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', - 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', - 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', - 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', - 
'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', - 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', - 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', - 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', - 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', - 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', - 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', -} - -import sys, os -try: from hashlib import md5 -except ImportError: from md5 import md5 - -def _validate_md5(egg_name, data): - if egg_name in md5_data: - digest = md5(data).hexdigest() - if digest != md5_data[egg_name]: - print >>sys.stderr, ( - "md5 validation of %s failed! (Possible download problem?)" - % egg_name - ) - sys.exit(2) - return data - -def use_setuptools( - version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, - download_delay=15 -): - """Automatically find/download setuptools and make it available on sys.path - - `version` should be a valid setuptools version number that is available - as an egg for download under the `download_base` URL (which should end with - a '/'). `to_dir` is the directory where setuptools will be downloaded, if - it is not already available. If `download_delay` is specified, it should - be the number of seconds that will be paused before initiating a download, - should one be required. If an older version of setuptools is installed, - this routine will print a message to ``sys.stderr`` and raise SystemExit in - an attempt to abort the calling script. 
- """ - was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules - def do_download(): - egg = download_setuptools(version, download_base, to_dir, download_delay) - sys.path.insert(0, egg) - import setuptools; setuptools.bootstrap_install_from = egg - try: - import pkg_resources - except ImportError: - return do_download() - try: - pkg_resources.require("setuptools>="+version); return - except pkg_resources.VersionConflict, e: - if was_imported: - print >>sys.stderr, ( - "The required version of setuptools (>=%s) is not available, and\n" - "can't be installed while this script is running. Please install\n" - " a more recent version first, using 'easy_install -U setuptools'." - "\n\n(Currently using %r)" - ) % (version, e.args[0]) - sys.exit(2) - else: - del pkg_resources, sys.modules['pkg_resources'] # reload ok - return do_download() - except pkg_resources.DistributionNotFound: - return do_download() - -def download_setuptools( - version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, - delay = 15 -): - """Download setuptools from a specified location and return its filename - - `version` should be a valid setuptools version number that is available - as an egg for download under the `download_base` URL (which should end - with a '/'). `to_dir` is the directory where the egg will be downloaded. - `delay` is the number of seconds to pause before an actual download attempt. - """ - import urllib2, shutil - egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) - url = download_base + egg_name - saveto = os.path.join(to_dir, egg_name) - src = dst = None - if not os.path.exists(saveto): # Avoid repeated downloads - try: - from distutils import log - if delay: - log.warn(""" ---------------------------------------------------------------------------- -This script requires setuptools version %s to run (even to display -help). 
I will attempt to download it for you (from -%s), but -you may need to enable firewall access for this script first. -I will start the download in %d seconds. - -(Note: if this machine does not have network access, please obtain the file - - %s - -and place it in this directory before rerunning this script.) ----------------------------------------------------------------------------""", - version, download_base, delay, url - ); from time import sleep; sleep(delay) - log.warn("Downloading %s", url) - src = urllib2.urlopen(url) - # Read/write all in one block, so we don't create a corrupt file - # if the download is interrupted. - data = _validate_md5(egg_name, src.read()) - dst = open(saveto,"wb"); dst.write(data) - finally: - if src: src.close() - if dst: dst.close() - return os.path.realpath(saveto) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -def main(argv, version=DEFAULT_VERSION): - """Install or upgrade setuptools and EasyInstall""" - try: - import setuptools - except ImportError: - egg = None - try: - egg = download_setuptools(version, delay=0) - sys.path.insert(0,egg) - from setuptools.command.easy_install import main - return main(list(argv)+[egg]) # we're done here - finally: - if egg and os.path.exists(egg): - os.unlink(egg) - else: - if setuptools.__version__ == '0.0.1': - print >>sys.stderr, ( - "You have an obsolete version of setuptools installed. Please\n" - "remove it from your system entirely before rerunning this script." 
- ) - sys.exit(2) - - req = "setuptools>="+version - import pkg_resources - try: - pkg_resources.require(req) - except pkg_resources.VersionConflict: - try: - from setuptools.command.easy_install import main - except ImportError: - from easy_install import main - main(list(argv)+[download_setuptools(delay=0)]) - sys.exit(0) # try to force an exit - else: - if argv: - from setuptools.command.easy_install import main - main(argv) - else: - print "Setuptools version",version,"or greater has been installed." - print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' - -def update_md5(filenames): - """Update our built-in md5 registry""" - - import re - - for name in filenames: - base = os.path.basename(name) - f = open(name,'rb') - md5_data[base] = md5(f.read()).hexdigest() - f.close() - - data = [" %r: %r,\n" % it for it in md5_data.items()] - data.sort() - repl = "".join(data) - - import inspect - srcfile = inspect.getsourcefile(sys.modules[__name__]) - f = open(srcfile, 'rb'); src = f.read(); f.close() - - match = re.search("\nmd5_data = {\n([^}]+)}", src) - if not match: - print >>sys.stderr, "Internal error!" 
- sys.exit(2) - - src = src[:match.start(1)] + repl + src[match.end(1):] - f = open(srcfile,'w') - f.write(src) - f.close() - - -if __name__=='__main__': - if len(sys.argv)>2 and sys.argv[1]=='--md5update': - update_md5(sys.argv[2:]) - else: - main(sys.argv[1:]) \ No newline at end of file diff --git a/pybloom/__init__.py b/pybloom/__init__.py index b533952..cd51525 100644 --- a/pybloom/__init__.py +++ b/pybloom/__init__.py @@ -1,4 +1,3 @@ """pybloom """ -from pybloom import BloomFilter, ScalableBloomFilter, __version__, __author__ diff --git a/pybloom/benchmarks.py b/pybloom/benchmarks.py index aa224a8..661a36b 100755 --- a/pybloom/benchmarks.py +++ b/pybloom/benchmarks.py @@ -4,44 +4,46 @@ import sys from pybloom import BloomFilter import bitarray, math, time +from utils import range_fn + def main(capacity=100000, request_error_rate=0.1): f = BloomFilter(capacity=capacity, error_rate=request_error_rate) assert (capacity == f.capacity) start = time.time() - for i in xrange(0, f.capacity): + for i in range_fn(0, f.capacity): f.add(i, skip_check=True) end = time.time() - print "{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( - end - start, f.capacity / (end - start)) + print("{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format( + end - start, f.capacity / (end - start))) oneBits = f.bitarray.count(True) zeroBits = f.bitarray.count(False) #print "Number of 1 bits:", oneBits #print "Number of 0 bits:", zeroBits - print "Number of Filter Bits:", f.num_bits - print "Number of slices:", f.num_slices - print "Bits per slice:", f.bits_per_slice - print "------" - print "Fraction of 1 bits at capacity: {:5.3f}".format( - oneBits / float(f.num_bits)) + print("Number of Filter Bits:", f.num_bits) + print("Number of slices:", f.num_slices) + print("Bits per slice:", f.bits_per_slice) + print("------") + print("Fraction of 1 bits at capacity: {:5.3f}".format( + oneBits / float(f.num_bits))) # Look for false positives and measure the actual 
fp rate trials = f.capacity fp = 0 start = time.time() - for i in xrange(f.capacity, f.capacity + trials + 1): + for i in range_fn(f.capacity, f.capacity + trials + 1): if i in f: fp += 1 end = time.time() - print ("{:5.3f} seconds to check false positives, " - "{:10.2f} checks/second".format(end - start, trials / (end - start))) - print "Requested FP rate: {:2.4f}".format(request_error_rate) - print "Experimental false positive rate: {:2.4f}".format(fp / float(trials)) + print(("{:5.3f} seconds to check false positives, " + "{:10.2f} checks/second".format(end - start, trials / (end - start)))) + print("Requested FP rate: {:2.4f}".format(request_error_rate)) + print("Experimental false positive rate: {:2.4f}".format(fp / float(trials))) # Compute theoretical fp max (Goel/Gupta) k = f.num_slices m = f.num_bits n = f.capacity fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k) - print "Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory) + print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)) if __name__ == '__main__' : status = main() diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py index 1ff11a5..beeefe4 100644 --- a/pybloom/pybloom.py +++ b/pybloom/pybloom.py @@ -7,7 +7,7 @@ >>> from pybloom import BloomFilter >>> f = BloomFilter(capacity=10000, error_rate=0.001) - >>> for i in xrange(0, f.capacity): + >>> for i in range_fn(0, f.capacity): ... _ = f.add(i) ... >>> 0 in f @@ -22,7 +22,7 @@ >>> from pybloom import ScalableBloomFilter >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) >>> count = 10000 - >>> for i in xrange(0, count): + >>> for i in range_fn(0, count): ... _ = sbf.add(i) ... 
>>> sbf.capacity > count @@ -33,11 +33,11 @@ True """ +from __future__ import absolute_import import math import hashlib +from pybloom.utils import range_fn, is_string_io, running_python_3 from struct import unpack, pack, calcsize -import StringIO -import cStringIO try: import bitarray @@ -73,12 +73,18 @@ def make_hashfuncs(num_slices, num_bits): num_salts, extra = divmod(num_slices, len(fmt)) if extra: num_salts += 1 - salts = tuple(hashfn(hashfn(pack('I', i)).digest()) for i in xrange(num_salts)) + salts = tuple(hashfn(hashfn(pack('I', i)).digest()) for i in range_fn(num_salts)) def _make_hashfuncs(key): - if isinstance(key, unicode): - key = key.encode('utf-8') + if running_python_3: + if isinstance(key, str): + key = key.encode('utf-8') + else: + key = str(key).encode('utf-8') else: - key = str(key) + if isinstance(key, unicode): + key = key.encode('utf-8') + else: + key = str(key) i = 0 for salt in salts: h = salt.copy() @@ -93,7 +99,7 @@ def _make_hashfuncs(key): class BloomFilter(object): - FILE_FMT = ' 0: - (filter.bitarray.frombytes(f.read(n-headerlen)) - if isinstance(f, (StringIO.StringIO, - cStringIO.InputType, - cStringIO.OutputType)) + (filter.bitarray.frombytes(f.read(n-headerlen)) if is_string_io(f) else filter.bitarray.fromfile(f, n - headerlen)) else: - (filter.bitarray.frombytes(f.read()) if isinstance(f, (StringIO.StringIO, - cStringIO.InputType, - cStringIO.OutputType)) + (filter.bitarray.frombytes(f.read()) if is_string_io(f) else filter.bitarray.fromfile(f)) if filter.num_bits != filter.bitarray.length() and \ (filter.num_bits + (8 - filter.num_bits % 8) != filter.bitarray.length()): - raise ValueError, 'Bit length mismatch!' + raise ValueError('Bit length mismatch!') return filter @@ -393,14 +392,14 @@ def tofile(self, f): self.initial_capacity, self.error_rate)) # Write #-of-filters - f.write(pack(' 0: # Then each filter directly, with a header describing # their lengths. 
headerpos = f.tell() - headerfmt = '<' + 'Q'*(len(self.filters)) - f.write('.' * calcsize(headerfmt)) + headerfmt = b'<' + b'Q'*(len(self.filters)) + f.write(b'.' * calcsize(headerfmt)) filter_sizes = [] for filter in self.filters: begin = f.tell() @@ -415,9 +414,9 @@ def fromfile(cls, f): """Deserialize the ScalableBloomFilter in file object `f'.""" filter = cls() filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) - nfilters, = unpack(' 0: - header_fmt = '<' + 'Q'*nfilters + header_fmt = b'<' + b'Q'*nfilters bytes = f.read(calcsize(header_fmt)) filter_lengths = unpack(header_fmt, bytes) for fl in filter_lengths: diff --git a/pybloom/tests.py b/pybloom/tests.py index edd97d7..13d9b7d 100644 --- a/pybloom/tests.py +++ b/pybloom/tests.py @@ -1,11 +1,17 @@ -import StringIO -import cStringIO +from __future__ import absolute_import +from pybloom.pybloom import BloomFilter, ScalableBloomFilter +from pybloom.utils import running_python_3, range_fn + +try: + from StringIO import StringIO + import cStringIO +except ImportError: + from io import BytesIO as StringIO import os import doctest import unittest import random import tempfile -from pybloom import BloomFilter, ScalableBloomFilter from unittest import TestSuite def additional_tests(): @@ -20,28 +26,28 @@ class TestUnionIntersection(unittest.TestCase): def test_union(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) - chars = [chr(i) for i in range(97, 123)] - for char in chars[len(chars)/2:]: + chars = [chr(i) for i in range_fn(97, 123)] + for char in chars[int(len(chars)/2):]: bloom_one.add(char) - for char in chars[:len(chars)/2]: + for char in chars[:int(len(chars)/2)]: bloom_two.add(char) new_bloom = bloom_one.union(bloom_two) for char in chars: - self.assert_(char in new_bloom) + self.assertTrue(char in new_bloom) def test_intersection(self): bloom_one = BloomFilter(100, 0.001) bloom_two = BloomFilter(100, 0.001) - chars = [chr(i) for i in range(97, 123)] + chars = 
[chr(i) for i in range_fn(97, 123)] for char in chars: bloom_one.add(char) - for char in chars[:len(chars)/2]: + for char in chars[:int(len(chars)/2)]: bloom_two.add(char) new_bloom = bloom_one.intersection(bloom_two) - for char in chars[:len(chars)/2]: - self.assert_(char in new_bloom) - for char in chars[len(chars)/2:]: - self.assert_(char not in new_bloom) + for char in chars[:int(len(chars)/2)]: + self.assertTrue(char in new_bloom) + for char in chars[int(len(chars)/2):]: + self.assertTrue(char not in new_bloom) def test_intersection_capacity_fail(self): bloom_one = BloomFilter(1000, 0.001) @@ -73,7 +79,7 @@ def _run(): class Serialization(unittest.TestCase): SIZE = 12345 - EXPECTED = set([random.randint(0, 10000100) for _ in xrange(SIZE)]) + EXPECTED = set([random.randint(0, 10000100) for _ in range_fn(SIZE)]) def test_serialization(self): for klass, args in [(BloomFilter, (self.SIZE,)), @@ -84,21 +90,23 @@ def test_serialization(self): f = tempfile.TemporaryFile() filter.tofile(f) - - stringio = StringIO.StringIO() - cstringio = cStringIO.StringIO() + stringio = StringIO() filter.tofile(stringio) - filter.tofile(cstringio) + streams_to_test = [f, stringio] + if not running_python_3: + cstringio = cStringIO.StringIO() + filter.tofile(cstringio) + streams_to_test.append(cstringio) + del filter - f.seek(0) - stringio.seek(0) - cstringio.seek(0) - for filter in (klass.fromfile(f), - klass.fromfile(stringio), - klass.fromfile(cstringio)): + for stream in streams_to_test: + stream.seek(0) + filter = klass.fromfile(stream) for item in self.EXPECTED: - self.assert_(item in filter) + self.assertTrue(item in filter) + del(filter) + stream.close() if __name__ == '__main__': unittest.main() diff --git a/pybloom/utils.py b/pybloom/utils.py new file mode 100644 index 0000000..535d77b --- /dev/null +++ b/pybloom/utils.py @@ -0,0 +1,24 @@ +import sys +try: + import StringIO + import cStringIO +except ImportError: + from io import BytesIO + +running_python_3 = 
sys.version_info[0] == 3 + + +def range_fn(*args): + if running_python_3: + return range(*args) + else: + return xrange(*args) + + +def is_string_io(instance): + if running_python_3: + return isinstance(instance, BytesIO) + else: + return isinstance(instance, (StringIO.StringIO, + cStringIO.InputType, + cStringIO.OutputType)) \ No newline at end of file diff --git a/setup.py b/setup.py index e9655a3..a49cf2f 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,5 @@ #!/usr/bin/env python -from ez_setup import use_setuptools -use_setuptools() - -import os - -from setuptools import setup, find_packages, Extension +from setuptools import setup VERSION = '2.0.0' DESCRIPTION = "PyBloom: A Probabilistic data structure" @@ -19,6 +14,7 @@ Intended Audience :: Developers License :: OSI Approved :: MIT License Programming Language :: Python +Programming Language :: Python :: 3 Operating System :: OS Independent Topic :: Utilities Topic :: Database :: Database Engines/Servers @@ -37,7 +33,6 @@ author_email="jay.baird@me.com", url="http://github.com/jaybaird/python-bloomfilter/", license="MIT License", - packages=find_packages(exclude=['ez_setup']), platforms=['any'], test_suite="pybloom.tests", zip_safe=True, diff --git a/tox.ini b/tox.ini index a8929fa..6d9848f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py26,py27 +envlist = py26,py27,py34 [testenv] -deps=nose -commands=nosetests +deps=pytest +commands=py.test pybloom/tests.py