Tags are now parsed during import, and inserted into the index as an array field.

Also renamed the multivalued Solr fields so that their names are plural (`answer-json` → `answers-json`, `tag` → `tags`).
2025-04-03 08:13:26 +00:00 · 2011-11-06 18:02:06 +11:00 · 2011-11-06 18:02:06 +11:00 · 045b50fe6c
commit 045b50fe6c
parent 098a4f2fa9
4 changed files with 8 additions and 24 deletions
--- a/java/solr/server/solr/conf/schema.xml
+++ b/java/solr/server/solr/conf/schema.xml
@ -505,13 +505,13 @@
   <!-- TODO: the title field should be boosted at index-time -->
   <field name="title" type="text_general" indexed="true" stored="false" required="true"/>
   <field name="question-json" type="string" indexed="false" stored="true" required="true"/>
-   <field name="answer-json" type="string" indexed="false" stored="true" multiValued="true"/>
+   <field name="answers-json" type="string" indexed="false" stored="true" multiValued="true"/>
   <field name="ownerUserId" type="tint" indexed="true" stored="true" required="true"/>
   <field name="lastEditorUserId" type="tint" indexed="false" stored="false"/>
   <field name="lastActivityDate" type="tdate" indexed="true" stored="false"/>
   <field name="communityOwnedDate" type="tdate" indexed="false" stored="false"/>
   <field name="closedDate" type="tdate" indexed="false" stored="false"/>
-   <field name="tag" type="string" indexed="true" stored="false" multiValued="true"/>
+   <field name="tags" type="string" indexed="true" stored="false" multiValued="true"/>

   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema  -->
--- a/python/src/stackdump/app.py
+++ b/python/src/stackdump/app.py
@ -27,21 +27,6 @@ MEDIA_ROOT = os.path.abspath(BOTTLE_ROOT + '/../../media')
 thread_locals = threading.local()


-# CUSTOM TEMPLATE TAGS AND FILTERS
-
-def parse_se_tags(value):
-    '''\
-    Parses the string of tags as given in the StackExchange XML site dump. The
-    format is:
-    
-    <feature-request><filter>
-    '''
-    # if it isn't a string, just do nothing
-    if not isinstance(value, basestring):
-        return value
-    
-    return re.findall(r'<([^>]+)>', value)
-
 # RESOURCE DECORATORS

 def uses_templates(fn):
@ -62,7 +47,6 @@ def uses_templates(fn):
                # template.
                extensions=['jinja2.ext.autoescape']
            )
-            thread_locals.template_env.filters['parse_se_tags'] = parse_se_tags
    
    if not fn:
        init_templates()
--- a/python/src/stackdump/dataproc/insert.py
+++ b/python/src/stackdump/dataproc/insert.py
@ -201,7 +201,7 @@ class PostContentHandler(xml.sax.ContentHandler):
            AnswerCount="3" CommentCount="1" FavoriteCount="3" />

    """
-    TAGS_RE = re.compile(u'&lt;([\w\d\-]+)&gt;')
+    TAGS_RE = re.compile(u'<([^>]+)>')
    
    def __init__(self, site):
        self.site = site
@ -385,7 +385,7 @@ class PostContentHandler(xml.sax.ContentHandler):
        doc['text'] = search_text
        
        # serialise answers to JSON
-        doc['answer-json'] = [ json.dumps(a, default=self.json_default_handler) for a in q['answers'] ]
+        doc['answers-json'] = [ json.dumps(a, default=self.json_default_handler) for a in q['answers'] ]
        
        # map other fields to search index doc
        doc['id'] = str(q['id'])
@ -427,7 +427,7 @@ class PostContentHandler(xml.sax.ContentHandler):
            question_obj['closedDate'] = q['closedDate']
        question_obj['title'] = q['title']
        if 'tags' in q:
-            question_obj['tags'] = q['tags']
+            question_obj['tags'] = PostContentHandler.TAGS_RE.findall(q['tags'])
        question_obj['favoriteCount'] = q['favoriteCount']
        question_obj['comments'] = q['comments']
        
--- a/python/src/stackdump/templates/results.html
+++ b/python/src/stackdump/templates/results.html
@ -29,8 +29,8 @@
                            <p>vote{% if r.question.score != 1 %}s{% endif %}</p>
                        </div>
                        <div class="post-stat">
-                            <p class="post-stat-value">{{ r.answer|length }}</p>
-                            <p>answer{% if r.answer|length != 1 %}s{% endif %}</p>
+                            <p class="post-stat-value">{{ r.answers|length }}</p>
+                            <p>answer{% if r.answers|length != 1 %}s{% endif %}</p>
                        </div>
                    </div>
                    <div class="post-summary">
@ -41,7 +41,7 @@
                            <strong>{{ r.question.creationDate }}</strong>.
                        </p>
                        <div class="post-tags">
-                            {% for t in r.question.tags|parse_se_tags %}
+                            {% for t in r.question.tags %}
                            <span class="label">{{ t }}</span>
                            {% endfor %}
                        </div>