omnipaste
changeset 106:ffd751a09aee
Complete fulltext search for irc posts near each other.
| author | Chouser <chouser@bluweb.com> |
|---|---|
| date | Thu May 17 02:03:12 2007 -0400 (5 years ago) |
| parents | 271c67c45c73 |
| children | 3fd0c6790170 |
| files | _log.cgi _style.css ircsrp.kid |
line diff
1.1 --- a/_log.cgi Wed May 16 23:23:34 2007 -0400
1.2 +++ b/_log.cgi Thu May 17 02:03:12 2007 -0400
1.3 @@ -172,9 +172,10 @@
1.4
1.5 class LogIter( object ):
1.6 __slots__ = [ 'dictcursor', 'result', 'dateline', 'lastdate', 'odd',
1.7 - 'startoffset', 'breaksum', 'runningsum' ]
1.8 + 'startoffset', 'breaksum', 'runningsum', 're_qwords' ]
1.9
1.10 - def __init__( self, dictcursor, startoffset = None, breaksum = None ):
1.11 + def __init__( self, dictcursor, startoffset = None, breaksum = None,
1.12 + qwords = None ):
1.13 self.dictcursor = dictcursor
1.14 self.dateline = None
1.15 self.lastdate = datetime.datetime( 1980, 1, 1 )
1.16 @@ -182,6 +183,10 @@
1.17 self.startoffset = startoffset
1.18 self.breaksum = breaksum
1.19 self.runningsum = sha.new()
1.20 + if qwords:
1.21 + self.re_qwords = re.compile( r'\b(%s)\b' % '|'.join( qwords ), re.I )
1.22 + else:
1.23 + self.re_qwords = None
1.24
1.25 def timestamp( self ):
1.26 return self.result['whenCreated']
1.27 @@ -216,6 +221,9 @@
1.28 text = re.sub('<', '&lt;', text )
1.29 text = re.sub('>', '&gt;', text )
1.30
1.31 + if self.re_qwords:
1.32 + text = self.re_qwords.sub( '<span class="q">\g<1></span>', text )
1.33 +
1.34 text = re.sub('\n', '<br />', text )
1.35
1.36 text = re.sub('\003(\d\d?)(.*?)(?=\003|\017|$)', colorize, text )
1.37 @@ -278,11 +286,68 @@
1.38 if self.breaksum and self.breaksum == self.runningsum.hexdigest():
1.39 break
1.40
1.41 + def simpleiter( self ):
1.42 + self.odd = True
1.43 + for self.result in self.dictcursor:
1.44 + yield self
1.45 +
1.46 def sum( self ):
1.47 for tmp in self.logentries():
1.48 pass
1.49 return self.runningsum.hexdigest()
1.50
1.51 +class PostGroup( object ):
1.52 + __slots__ = [ 'min', 'max', 'score', 'istartdate', 'istartcode', 'iendcode', 'qwords' ]
1.53 +
1.54 + def __init__( self ):
1.55 + self.score = 0
1.56 + self.istartdate = None
1.57 + self.istartcode = None
1.58 + self.iendcode = None
1.59 +
1.60 + def __repr__( self ):
1.61 + return '<PostGroup %d-%d, %0.4f>' % (self.min, self.max, self.score)
1.62 +
1.63 + def fetchdates( self ):
1.64 + cursor = opcommon.dbconn.cursor
1.65 + cursor.execute('''
1.66 + SELECT whenCreated
1.67 + FROM irclog
1.68 + WHERE id = %d
1.69 + OR id = %d
1.70 + ORDER BY id''' %
1.71 + ( self.min, self.max ))
1.72 + self.istartdate = cursor.fetchone()[0]
1.73 + self.istartcode = datetime2str( self.istartdate )
1.74 + self.iendcode = datetime2str( cursor.fetchone()[0] )
1.75 +
1.76 + def startdate( self ):
1.77 + if not self.istartdate:
1.78 + self.fetchdates()
1.79 + return self.istartdate
1.80 + startdate = property( startdate )
1.81 +
1.82 + def startcode( self ):
1.83 + if not self.istartcode:
1.84 + self.fetchdates()
1.85 + return self.istartcode
1.86 + startcode = property( startcode )
1.87 +
1.88 + def endcode( self ):
1.89 + if not self.iendcode:
1.90 + self.fetchdates()
1.91 + return self.iendcode
1.92 + endcode = property( endcode )
1.93 +
1.94 + def posts( self ):
1.95 + cursor = opcommon.dbconn.dictcursor
1.96 + cursor.execute('''
1.97 + SELECT *
1.98 + FROM irclog
1.99 + WHERE id BETWEEN %d AND %d''' %
1.100 + ( self.min, self.max ))
1.101 + return LogIter( cursor, qwords = self.qwords ).simpleiter()
1.102 +
1.103 class Handler(cgi.Handler):
1.104 def forcelogin(self, req, msg="Error: You must be logged in to view the log\n"):
1.105 req.set_header(
1.106 @@ -320,22 +385,78 @@
1.107 tmpl.topics = LogIter( cursor ).logentries()
1.108 req.set_header('Content-Type', 'text/html; charset=UTF-8')
1.109 req.write( tmpl.serialize( output=serializer ) )
1.110 +
1.111 elif 'q' in req.params:
1.112 if not authinfo:
1.113 self.forcelogin( req )
1.114 return
1.115
1.116 - cursor = opcommon.dbconn.dictcursor
1.117 + cursor = opcommon.dbconn.cursor
1.118 tmpl = kid.Template( file='ircsrp.kid' )
1.119 tmpl.homeurl = opcommon.opconf.ircurl
1.120 tmpl.q = req.params.get('q')
1.121 + qwords = req.params.get('q').split()
1.122
1.123 - cursor.execute('SELECT * FROM irclog WHERE MATCH( text ) AGAINST ( %s )',
1.124 - ( req.params.get('q') ) )
1.125 - tmpl.logentries = LogIter( cursor ).logentries()
1.126 + # get raw number of matching words per post id
1.127 + scorehash = {}
1.128 + datehash = {}
1.129 + for word in qwords:
1.130 + cursor.execute('''
1.131 + SELECT id, whenCreated
1.132 + FROM irclog
1.133 + WHERE MATCH( text ) AGAINST ( %s )
1.134 + ORDER BY id''', word )
1.135 +
1.136 + for tup in cursor:
1.137 + id = tup[0]
1.138 + scorehash[ id ] = 1 + scorehash.get( id, 0 )
1.139 + datehash[ id ] = tup[1]
1.140 +
1.141 + idlist = scorehash.keys()
1.142 + idlist.sort()
1.143 +
1.144 + # increase the score of posts that have other high-scoring posts nearby
1.145 + window = 5
1.146 + power = 1.2
1.147 +
1.148 + outrighti = 0
1.149 + for leftid in idlist:
1.150 + for righti in xrange( outrighti, len( idlist ) ):
1.151 + rightid = idlist[ righti ]
1.152 + if leftid - window > rightid:
1.153 + # righti is still too low
1.154 + outrighti = righti
1.155 + elif leftid + window < rightid:
1.156 + # righti is now too high
1.157 + break
1.158 + elif leftid == rightid:
1.159 + # multiple words on same id have already been counted
1.160 + pass
1.161 + else:
1.162 + factor = 1.0 / abs( leftid - rightid ) ** power
1.163 + scorehash[ leftid ] += float( scorehash[ rightid ] ) * factor
1.164 +
1.165 + # build and score groups of posts
1.166 + groups = []
1.167 + thisgroup = None
1.168 + for id in idlist:
1.169 + if thisgroup != None and thisgroup.max + window >= id - window:
1.170 + thisgroup.max = id + window
1.171 + else:
1.172 + thisgroup = PostGroup()
1.173 + thisgroup.qwords = qwords
1.174 + thisgroup.min = id - window
1.175 + thisgroup.max = id + window
1.176 + groups.append( thisgroup )
1.177 + thisgroup.score = max( thisgroup.score, scorehash[ id ] )
1.178 +
1.179 + groups.sort( key = lambda g: g.score, reverse = True )
1.180 +
1.181 + tmpl.postgroups = groups
1.182
1.183 req.set_header('Content-Type', 'text/html; charset=UTF-8')
1.184 req.write( tmpl.serialize( output=serializer ) )
1.185 +
1.186 else:
1.187 cursor = opcommon.dbconn.dictcursor
1.188 tmpl = kid.Template( file='irclog.kid' )
2.1 --- a/_style.css Wed May 16 23:23:34 2007 -0400
2.2 +++ b/_style.css Thu May 17 02:03:12 2007 -0400
2.3 @@ -227,25 +227,8 @@
2.4 display: block;
2.5 }
2.6
2.7 -#scrollframe {
2.8 - position: fixed;
2.9 - height: 100%;
2.10 - width: 6em;
2.11 - left: 0;
2.12 - background: #ccf;
2.13 - border-right: 1px #55b solid;
2.14 - font-size: 8pt;
2.15 -}
2.16 -
2.17 -#scrollframe .labels {
2.18 - position: absolute;
2.19 - width: 5.5em;
2.20 - overflow: visible;
2.21 -}
2.22 -
2.23 -#scrollframe .labels div {
2.24 - position: absolute;
2.25 - margin-left: 0.8em;
2.26 +.q {
2.27 + background: #ff0;
2.28 }
2.29
2.30 /*
3.1 --- a/ircsrp.kid Wed May 16 23:23:34 2007 -0400
3.2 +++ b/ircsrp.kid Thu May 17 02:03:12 2007 -0400
3.3 @@ -13,24 +13,27 @@
3.4 <p>
3.5 Search results for: <b py:content="q" />
3.6 </p>
3.7 +
3.8 <table class="irclog ircsrp">
3.9 - <tbody>
3.10 - <tr py:for="entry in logentries" id="p${entry.id}" class="${entry.evenodd}" >
3.11 - <if py:if="entry.dateline" py:strip="True">
3.12 - <td py:content="entry.dateline.strftime('%A, %e %B %Y
3.13 - %H:%M:%S')" colspan="4" class="dateline" />
3.14 - </if>
3.15 - <if py:if="not entry.dateline" py:strip="True">
3.16 - <td class="timestamp"
3.17 - py:content="entry.timestamp.strftime('%H:%M')" />
3.18 - <td class="attrib" py:content="entry.attrib" />
3.19 - <td><span class="pretext" py:if="entry.pretext"
3.20 - py:content="entry.pretext"
3.21 - /><span py:replace="entry.text" />
3.22 - </td>
3.23 - </if>
3.24 + <tbody py:for="group in postgroups">
3.25 + <tr>
3.26 + <td colspan="4" class="dateline">
3.27 + <text py:replace="group.startdate.strftime('%A, %e %B %Y %H:%M:%S')"/>
3.28 + <a href="?start=${group.startcode}&amp;end=${group.endcode}">link</a>
3.29 + </td>
3.30 + </tr>
3.31 + <tr py:for="entry in group.posts()" id="p${entry.id}"
3.32 + class="${entry.evenodd}" >
3.33 + <td class="timestamp"
3.34 + py:content="entry.timestamp.strftime('%H:%M')" />
3.35 + <td class="attrib" py:content="entry.attrib" />
3.36 + <td><span class="pretext" py:if="entry.pretext"
3.37 + py:content="entry.pretext"
3.38 + /><span py:replace="entry.text" />
3.39 + </td>
3.40 </tr>
3.41 </tbody>
3.42 </table>
3.43 +
3.44 </body>
3.45 </html>
