#! /usr/bin/env python
#
# Baidu Blog and Forum Timed Corpus sample script (word temporal statistics)
#
# Copyright (c) 2010 Baidu Japan, Inc.
# URL: http://www.baidu.jp/corpus/
#
# Usage: python bbftf_sample.py [corpusdir [query word ...]]
#
#   corpusdir  directory holding the monthly "YYYY-MM.<n>gm" files
#              (default: "./")
#   query      space-separated n-gram to look up (default: u"\u85C1");
#              the number of words selects which <n>gm files are read
#
# Runs unchanged on Python 2.6+ and Python 3.
#

import codecs
import io
import sys

# maximum scales -> 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, ... , 50
scales = [r*f for r in [10**x for x in range(-3, 2)] for f in [1, 2, 5]]

# screen width = 80
WIDTH = 80
GRAPHWIDTH = WIDTH-8-2-8  # WIDTH-len("YYYY-MM ")-2*len("|")-len("00.000 %")


def _to_text(value):
    """Return *value* as text, decoding UTF-8 bytes (Python 2 argv) if needed."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _month_percent(path, query):
    """Return the share (in percent) of *query* among all counts in *path*.

    The file is UTF-8 text with one "<n-gram>\\t<count>" record per line.
    Blank lines are skipped.  An empty file yields 0.0 instead of dividing
    by zero.  If *query* occurs on several lines the last one wins.

    Raises IOError/OSError if the file cannot be opened and ValueError if a
    non-blank line is not exactly two tab-separated fields with an integer
    count.
    """
    query_freq = 0
    total_freq = 0
    # The context manager closes the handle; one file is opened per month.
    with io.open(path, encoding='utf-8') as ngram_file:
        for line in ngram_file:
            line = line.rstrip("\r\n")
            if not line:
                continue
            w, c_str = line.split('\t')
            # Parse the whole field: slicing off "the newline" would drop a
            # digit when the last line of the file has no trailing newline.
            c = int(c_str)
            total_freq += c
            if query == w:
                query_freq = c
    if total_freq == 0:
        return 0.0
    return query_freq * 100.0 / total_freq


def _pick_scale(max_percent):
    """Return the smallest entry of ``scales`` strictly above *max_percent*.

    Falls back to 100.0 (a percentage cannot exceed it) when no predefined
    scale is large enough, i.e. when *max_percent* is 50 or more.
    """
    candidates = [s for s in scales if s > max_percent]
    if not candidates:
        return 100.0
    return min(candidates)


def _render_graph(res, scale):
    """Return the axis line followed by one bar line per entry of *res*.

    *res* is a sequence of ``[ymstr, percent]`` pairs.  The axis is built so
    that its two "|" characters sit in the same columns as those of the bar
    lines, whatever the printed width of *scale*.
    """
    left = " 0 %"
    right = "% 7.3f %% " % scale
    inner = left + " " * (GRAPHWIDTH - len(left) - len(right)) + right
    lines = ["%s |%s|" % (" " * len("YYYY-MM"), inner)]
    for ymstr, percent in res:
        # Clamp so a value equal to the scale cannot overflow the frame.
        width = max(0, min(GRAPHWIDTH, int(GRAPHWIDTH * percent / scale)))
        lines.append("%s |%s%s| % 7.3f %%"
                     % (ymstr, "*" * width, " " * (GRAPHWIDTH - width),
                        percent))
    return lines


def main(argv=None, out=None):
    """Print the monthly frequency graph of a query n-gram.

    argv -- argument list laid out like ``sys.argv`` (default: ``sys.argv``);
            ``argv[1]`` is the corpus directory, ``argv[2:]`` the query words.
    out  -- text stream to write to (default: a UTF-8 writer on stdout).

    Reads "<corpusdir>/YYYY-MM.<n>gm" for every month from 2000-01 through
    2010-07.  Any error from opening or parsing a file propagates.
    """
    if argv is None:
        argv = sys.argv
    if out is None:
        # Python 3 exposes the byte stream as sys.stdout.buffer; Python 2's
        # sys.stdout already accepts bytes.
        out = codecs.getwriter('utf_8')(getattr(sys.stdout, "buffer",
                                                sys.stdout))

    corpusdir = argv[1] if len(argv) > 1 else "./"
    query = _to_text(' '.join(argv[2:])) if len(argv) > 2 else u"\u85C1"
    gram = len(query.split(' '))

    out.write('Word temporal statistics of: "%s"\n' % query)

    res = []
    out.write("Reading %dgm ... " % gram)
    for year in range(2000, 2011):
        out.write("%d ... " % year)
        out.flush()
        for month in range(1, 13):
            # The corpus ends with July 2010.
            if year == 2010 and month == 8:
                break
            ymstr = "%d-%02d" % (year, month)
            percent = _month_percent(
                "%s/%s.%dgm" % (corpusdir, ymstr, gram), query)
            res.append([ymstr, percent])
    out.write("\n")

    # find the maximum fitting scale
    scale = _pick_scale(max(r[1] for r in res))

    # draw the axis and the graph
    for text in _render_graph(res, scale):
        out.write(text + "\n")
    out.flush()


if __name__ == "__main__":
    main()