omnipaste

changeset 106:ffd751a09aee

Complete fulltext search for irc posts near each other.
author Chouser <chouser@bluweb.com>
date Thu May 17 02:03:12 2007 -0400 (5 years ago)
parents 271c67c45c73
children 3fd0c6790170
files _log.cgi _style.css ircsrp.kid
line diff
1.1 --- a/_log.cgi Wed May 16 23:23:34 2007 -0400
1.2 +++ b/_log.cgi Thu May 17 02:03:12 2007 -0400
1.3 @@ -172,9 +172,10 @@
1.4
1.5 class LogIter( object ):
1.6 __slots__ = [ 'dictcursor', 'result', 'dateline', 'lastdate', 'odd',
1.7 - 'startoffset', 'breaksum', 'runningsum' ]
1.8 + 'startoffset', 'breaksum', 'runningsum', 're_qwords' ]
1.9
1.10 - def __init__( self, dictcursor, startoffset = None, breaksum = None ):
1.11 + def __init__( self, dictcursor, startoffset = None, breaksum = None,
1.12 + qwords = None ):
1.13 self.dictcursor = dictcursor
1.14 self.dateline = None
1.15 self.lastdate = datetime.datetime( 1980, 1, 1 )
1.16 @@ -182,6 +183,10 @@
1.17 self.startoffset = startoffset
1.18 self.breaksum = breaksum
1.19 self.runningsum = sha.new()
1.20 + if qwords:
1.21 + self.re_qwords = re.compile( r'\b(%s)\b' % '|'.join( qwords ), re.I )
1.22 + else:
1.23 + self.re_qwords = None
1.24
1.25 def timestamp( self ):
1.26 return self.result['whenCreated']
1.27 @@ -216,6 +221,9 @@
1.28 text = re.sub('<', '&lt;', text )
1.29 text = re.sub('>', '&gt;', text )
1.30
1.31 + if self.re_qwords:
1.32 + text = self.re_qwords.sub( '<span class="q">\g<1></span>', text )
1.33 +
1.34 text = re.sub('\n', '<br />', text )
1.35
1.36 text = re.sub('\003(\d\d?)(.*?)(?=\003|\017|$)', colorize, text )
1.37 @@ -278,11 +286,68 @@
1.38 if self.breaksum and self.breaksum == self.runningsum.hexdigest():
1.39 break
1.40
1.41 + def simpleiter( self ):
1.42 + self.odd = True
1.43 + for self.result in self.dictcursor:
1.44 + yield self
1.45 +
1.46 def sum( self ):
1.47 for tmp in self.logentries():
1.48 pass
1.49 return self.runningsum.hexdigest()
1.50
1.51 +class PostGroup( object ):
1.52 + __slots__ = [ 'min', 'max', 'score', 'istartdate', 'istartcode', 'iendcode', 'qwords' ]
1.53 +
1.54 + def __init__( self ):
1.55 + self.score = 0
1.56 + self.istartdate = None
1.57 + self.istartcode = None
1.58 + self.iendcode = None
1.59 +
1.60 + def __repr__( self ):
1.61 + return '<PostGroup %d-%d, %0.4f>' % (self.min, self.max, self.score)
1.62 +
1.63 + def fetchdates( self ):
1.64 + cursor = opcommon.dbconn.cursor
1.65 + cursor.execute('''
1.66 + SELECT whenCreated
1.67 + FROM irclog
1.68 + WHERE id = %d
1.69 + OR id = %d
1.70 + ORDER BY id''' %
1.71 + ( self.min, self.max ))
1.72 + self.istartdate = cursor.fetchone()[0]
1.73 + self.istartcode = datetime2str( self.istartdate )
1.74 + self.iendcode = datetime2str( cursor.fetchone()[0] )
1.75 +
1.76 + def startdate( self ):
1.77 + if not self.istartdate:
1.78 + self.fetchdates()
1.79 + return self.istartdate
1.80 + startdate = property( startdate )
1.81 +
1.82 + def startcode( self ):
1.83 + if not self.istartcode:
1.84 + self.fetchdates()
1.85 + return self.istartcode
1.86 + startcode = property( startcode )
1.87 +
1.88 + def endcode( self ):
1.89 + if not self.iendcode:
1.90 + self.fetchdates()
1.91 + return self.iendcode
1.92 + endcode = property( endcode )
1.93 +
1.94 + def posts( self ):
1.95 + cursor = opcommon.dbconn.dictcursor
1.96 + cursor.execute('''
1.97 + SELECT *
1.98 + FROM irclog
1.99 + WHERE id BETWEEN %d AND %d''' %
1.100 + ( self.min, self.max ))
1.101 + return LogIter( cursor, qwords = self.qwords ).simpleiter()
1.102 +
1.103 class Handler(cgi.Handler):
1.104 def forcelogin(self, req, msg="Error: You must be logged in to view the log\n"):
1.105 req.set_header(
1.106 @@ -320,22 +385,78 @@
1.107 tmpl.topics = LogIter( cursor ).logentries()
1.108 req.set_header('Content-Type', 'text/html; charset=UTF-8')
1.109 req.write( tmpl.serialize( output=serializer ) )
1.110 +
1.111 elif 'q' in req.params:
1.112 if not authinfo:
1.113 self.forcelogin( req )
1.114 return
1.115
1.116 - cursor = opcommon.dbconn.dictcursor
1.117 + cursor = opcommon.dbconn.cursor
1.118 tmpl = kid.Template( file='ircsrp.kid' )
1.119 tmpl.homeurl = opcommon.opconf.ircurl
1.120 tmpl.q = req.params.get('q')
1.121 + qwords = req.params.get('q').split()
1.122
1.123 - cursor.execute('SELECT * FROM irclog WHERE MATCH( text ) AGAINST ( %s )',
1.124 - ( req.params.get('q') ) )
1.125 - tmpl.logentries = LogIter( cursor ).logentries()
1.126 + # get raw number of matching words per post id
1.127 + scorehash = {}
1.128 + datehash = {}
1.129 + for word in qwords:
1.130 + cursor.execute('''
1.131 + SELECT id, whenCreated
1.132 + FROM irclog
1.133 + WHERE MATCH( text ) AGAINST ( %s )
1.134 + ORDER BY id''', word )
1.135 +
1.136 + for tup in cursor:
1.137 + id = tup[0]
1.138 + scorehash[ id ] = 1 + scorehash.get( id, 0 )
1.139 + datehash[ id ] = tup[1]
1.140 +
1.141 + idlist = scorehash.keys()
1.142 + idlist.sort()
1.143 +
1.144 + # increase the score of posts that have other high-scoring posts nearby
1.145 + window = 5
1.146 + power = 1.2
1.147 +
1.148 + outrighti = 0
1.149 + for leftid in idlist:
1.150 + for righti in xrange( outrighti, len( idlist ) ):
1.151 + rightid = idlist[ righti ]
1.152 + if leftid - window > rightid:
1.153 + # righti is still too low
1.154 + outrighti = righti
1.155 + elif leftid + window < rightid:
1.156 + # righti is now too high
1.157 + break
1.158 + elif leftid == rightid:
1.159 + # multiple words on same id have already been counted
1.160 + pass
1.161 + else:
1.162 + factor = 1.0 / abs( leftid - rightid ) ** power
1.163 + scorehash[ leftid ] += float( scorehash[ rightid ] ) * factor
1.164 +
1.165 + # build and score groups of posts
1.166 + groups = []
1.167 + thisgroup = None
1.168 + for id in idlist:
1.169 + if thisgroup != None and thisgroup.max + window >= id - window:
1.170 + thisgroup.max = id + window
1.171 + else:
1.172 + thisgroup = PostGroup()
1.173 + thisgroup.qwords = qwords
1.174 + thisgroup.min = id - window
1.175 + thisgroup.max = id + window
1.176 + groups.append( thisgroup )
1.177 + thisgroup.score = max( thisgroup.score, scorehash[ id ] )
1.178 +
1.179 + groups.sort( key = lambda g: g.score, reverse = True )
1.180 +
1.181 + tmpl.postgroups = groups
1.182
1.183 req.set_header('Content-Type', 'text/html; charset=UTF-8')
1.184 req.write( tmpl.serialize( output=serializer ) )
1.185 +
1.186 else:
1.187 cursor = opcommon.dbconn.dictcursor
1.188 tmpl = kid.Template( file='irclog.kid' )
2.1 --- a/_style.css Wed May 16 23:23:34 2007 -0400
2.2 +++ b/_style.css Thu May 17 02:03:12 2007 -0400
2.3 @@ -227,25 +227,8 @@
2.4 display: block;
2.5 }
2.6
2.7 -#scrollframe {
2.8 - position: fixed;
2.9 - height: 100%;
2.10 - width: 6em;
2.11 - left: 0;
2.12 - background: #ccf;
2.13 - border-right: 1px #55b solid;
2.14 - font-size: 8pt;
2.15 -}
2.16 -
2.17 -#scrollframe .labels {
2.18 - position: absolute;
2.19 - width: 5.5em;
2.20 - overflow: visible;
2.21 -}
2.22 -
2.23 -#scrollframe .labels div {
2.24 - position: absolute;
2.25 - margin-left: 0.8em;
2.26 +.q {
2.27 + background: #ff0;
2.28 }
2.29
2.30 /*
3.1 --- a/ircsrp.kid Wed May 16 23:23:34 2007 -0400
3.2 +++ b/ircsrp.kid Thu May 17 02:03:12 2007 -0400
3.3 @@ -13,24 +13,27 @@
3.4 <p>
3.5 Search results for: <b py:content="q" />
3.6 </p>
3.7 +
3.8 <table class="irclog ircsrp">
3.9 - <tbody>
3.10 - <tr py:for="entry in logentries" id="p${entry.id}" class="${entry.evenodd}" >
3.11 - <if py:if="entry.dateline" py:strip="True">
3.12 - <td py:content="entry.dateline.strftime('%A, %e %B %Y
3.13 - %H:%M:%S')" colspan="4" class="dateline" />
3.14 - </if>
3.15 - <if py:if="not entry.dateline" py:strip="True">
3.16 - <td class="timestamp"
3.17 - py:content="entry.timestamp.strftime('%H:%M')" />
3.18 - <td class="attrib" py:content="entry.attrib" />
3.19 - <td><span class="pretext" py:if="entry.pretext"
3.20 - py:content="entry.pretext"
3.21 - /><span py:replace="entry.text" />
3.22 - </td>
3.23 - </if>
3.24 + <tbody py:for="group in postgroups">
3.25 + <tr>
3.26 + <td colspan="4" class="dateline">
3.27 + <text py:replace="group.startdate.strftime('%A, %e %B %Y %H:%M:%S')"/>
3.28 + <a href="?start=${group.startcode}&amp;end=${group.endcode}">link</a>
3.29 + </td>
3.30 + </tr>
3.31 + <tr py:for="entry in group.posts()" id="p${entry.id}"
3.32 + class="${entry.evenodd}" >
3.33 + <td class="timestamp"
3.34 + py:content="entry.timestamp.strftime('%H:%M')" />
3.35 + <td class="attrib" py:content="entry.attrib" />
3.36 + <td><span class="pretext" py:if="entry.pretext"
3.37 + py:content="entry.pretext"
3.38 + /><span py:replace="entry.text" />
3.39 + </td>
3.40 </tr>
3.41 </tbody>
3.42 </table>
3.43 +
3.44 </body>
3.45 </html>