Python Binary Search File By Line

Works well for large (10s of GBs) files. I used this to parse and explore the Web Data Commons Hyperlink Graph.

import os
 
def line_binary_search(filename, matchvalue, key=lambda val: val):
    """
    Binary search a file for matching lines.
    Returns a list of matching lines.
    filename - path to file, passed to 'open'
    matchvalue - value to match
    key - function to extract comparison value from line
 
    >>> parser = lambda val: int(val.split('\t')[0].strip())
    >>> line_binary_search('sd-arc', 63889187, parser)
    ['63889187\t3592559\n', ...]
    """
 
    # Must be greater than the maximum length of any line.
 
    max_line_len = 2 ** 12
 
    start = pos = 0
    end = os.path.getsize(filename)
 
    with open(filename, 'rb') as fptr:
 
        # Limit the number of times we binary search.
 
        for rpt in xrange(50):
 
            last = pos
            pos = start + ((end - start) / 2)
            fptr.seek(pos)
 
            # Move the cursor to a newline boundary.
 
            fptr.readline()
 
            line = fptr.readline()
            linevalue = key(line)
 
            if linevalue == matchvalue or pos == last:
 
                # Seek back until we no longer have a match.
 
                while True:
                    fptr.seek(-max_line_len, 1)
                    fptr.readline()
                    if matchvalue != key(fptr.readline()):
                        break
 
               # Seek forward to the first match.
 
                for rpt in xrange(max_line_len):
                    line = fptr.readline()
                    linevalue = key(line)
                    if matchvalue == linevalue:
                        break
                else:
                    # No match was found.
 
                    return []
 
                results = []
 
                while linevalue == matchvalue:
                    results.append(line)
                    line = fptr.readline()
                    linevalue = key(line)
 
                return results
            elif linevalue < matchvalue:
                start = fptr.tell()
            else:
                assert linevalue > matchvalue
                end = fptr.tell()
        else:
            raise RuntimeError('binary search failed')
random/python_binary_search_file_by_line.txt · Last modified: 2013/12/03 05:12 by grant