Python Binary Search File By Line
Works well for large (10s of GBs) files. I used this to parse and explore the Web Data Commons Hyperlink Graph.
import os def line_binary_search(filename, matchvalue, key=lambda val: val): """ Binary search a file for matching lines. Returns a list of matching lines. filename - path to file, passed to 'open' matchvalue - value to match key - function to extract comparison value from line >>> parser = lambda val: int(val.split('\t')[0].strip()) >>> line_binary_search('sd-arc', 63889187, parser) ['63889187\t3592559\n', ...] """ # Must be greater than the maximum length of any line. max_line_len = 2 ** 12 start = pos = 0 end = os.path.getsize(filename) with open(filename, 'rb') as fptr: # Limit the number of times we binary search. for rpt in xrange(50): last = pos pos = start + ((end - start) / 2) fptr.seek(pos) # Move the cursor to a newline boundary. fptr.readline() line = fptr.readline() linevalue = key(line) if linevalue == matchvalue or pos == last: # Seek back until we no longer have a match. while True: fptr.seek(-max_line_len, 1) fptr.readline() if matchvalue != key(fptr.readline()): break # Seek forward to the first match. for rpt in xrange(max_line_len): line = fptr.readline() linevalue = key(line) if matchvalue == linevalue: break else: # No match was found. return [] results = [] while linevalue == matchvalue: results.append(line) line = fptr.readline() linevalue = key(line) return results elif linevalue < matchvalue: start = fptr.tell() else: assert linevalue > matchvalue end = fptr.tell() else: raise RuntimeError('binary search failed')