Index database using tipuesearch

Indexing a small document database using Tipue Search, a small JavaScript library.
search
database
javascript
Published

June 8, 2015

Optimizing search on websites with tipuesearch js

// Sample Tipue Search content file: `tipuesearch.pages` is a list of entries,
// each carrying a "title", "text", "tags" and "url" string for one document.
var tipuesearch = {"pages": [
{"title": "Welcome to JIVE - Guidelines | ACME", "text": "Welcome to the ACME Jive Collaboration platform Please read this document, which contains","tags": "Welcome to JIVE - Guidelines | ACME","url": "http://10.0.82.13/DOC-1001.html"},
{"title": "Datacenter FAQ - ACME ISO Hosting | ACME", "text": "Where are the datacenters located? Do you subcontract activities? We have several datacenters around","tags": "Datacenter FAQ - ACME ISO Hosting | ACME","url": "http://10.0.82.13/DOC-1002.html"},
{"title": "Customer Information template | ACME", "text": "This document will need to become the template that is used to create the information of the custome","tags": "Customer Information template | ACME","url": "http://10.0.82.13/DOC-1003.html"},
{"title": "ISO Customer List | ACME", "text": "Please add customers as content is created. Please ensure you add the name alphabetically. Insert fo","tags": "ISO Customer List | ACME","url": "http://10.0.82.13/DOC-1004.html"},
{"title": "Accenture Phillipines/SiACMEpore - APH - Hosted | ACME", "text": "CustomerAccenture Philippines/SiACMEporealso known asAPHISO Project CodeAPHContract Start date01 June","tags": "Accenture Phillipines/SiACMEpore - APH - Hosted | ACME","url": "http://10.0.82.13/DOC-1006.html"},
{"title": "KB: SAP logon screen hangs - Oracle | ACME", "text": "KeywordsORA 257 00257 ORA-00257 Archivelog system hangs logon login screenSymptomSAP logon screen ap","tags": "KB: SAP logon screen hangs - Oracle | ACME","url": "http://10.0.82.13/DOC-1007.html"},
]};

Creating the index database using Python

import codecs
import glob
import json
import os
import string
import sys
# NOTE(review): urllib2 is a Python 2 module and has no `request` submodule
# (Python 3 spells it urllib.request), so this import raises ImportError.
# It is not referenced in the script as shown -- confirm and fix or drop it.
import urllib2.request

from bs4 import BeautifulSoup
#os.chdir("D:\MergedCopies_17042015\jive.ACME.com\docs\DOC-1008.html")

def _clean_field(value, drop_newlines=False):
    """Return *value* as a quoted, escaped JSON string literal.

    Double quotes are removed (and, optionally, newlines) exactly as the
    index has always done; json.dumps then escapes whatever is left
    (backslashes, tabs, other control characters) so the entry stays valid.
    """
    value = value.replace('"', '')
    if drop_newlines:
        value = value.replace('\n', '')
    return json.dumps(value.strip(), ensure_ascii=False)


def _format_entry(title, text, tags, url):
    """Build one Tipue Search ``pages`` entry line, trailing comma included."""
    return '{"title": %s, "text": %s,"tags": %s,"url": %s},\n' % (
        _clean_field(title),
        _clean_field(text, drop_newlines=True),
        _clean_field(tags),
        json.dumps(url, ensure_ascii=False),
    )


def paquillo(docs_dir=r"D:\MergedCopies_17042015\jive.ACME.com\docs",
             output_path=r"D:\Documentos\index.json",
             base_url="http://10.0.82.13/",
             max_lenth=100,
             tag_start=11):
    """Write one Tipue Search index entry per ``DOC-NNNN.html`` file.

    Each matching document in *docs_dir* is parsed with BeautifulSoup and
    turned into a ``{"title", "text", "tags", "url"}`` line in *output_path*.
    The lines are meant to be pasted inside ``var tipuesearch = {"pages": [``
    ... ``]};``. Requires Python 3 (``open(..., encoding=...)``).

    Unlike the earlier version, this function no longer changes the process
    working directory; files are located through *docs_dir* instead.

    Args:
        docs_dir: Directory holding the exported ``DOC-NNNN.html`` files.
        output_path: File that receives the generated entries (overwritten).
        base_url: Prefix put in front of each file name to form the entry URL.
        max_lenth: Maximum number of characters of body text kept per entry.
        tag_start: Number of trailing characters dropped from the folder
            label to form the tags -- presumably a fixed suffix; verify
            against the exported HTML.

    Returns:
        0, always.
    """
    pattern = os.path.join(glob.escape(docs_dir),
                           "DOC-[0-9][0-9][0-9][0-9].html")

    with open(output_path, "w", encoding="utf-8") as output_file:
        # Sorted so the generated index is reproducible between runs.
        for path in sorted(glob.glob(pattern)):
            file_name = os.path.basename(path)
            print('Document: ' + file_name)

            with open(path, encoding="utf-8") as f:
                # Name the parser explicitly so results do not depend on
                # which optional parsers happen to be installed.
                doc = BeautifulSoup(f.read(), "html.parser")

            # A missing or non-trivial <title> used to crash the whole run;
            # fall back to the file name so the entry is still searchable.
            title = doc.title.string if doc.title is not None else None
            if not title:
                title = file_name

            body = doc.select_one('.jive-rendered-content')
            body_text = body.get_text() if body is not None else ''
            content_text = body_text[:max_lenth] if body_text else title

            folder = doc.select_one('.jive-icon-med .jive-icon-folder')
            folder_text = folder.get_text() if folder is not None else ''
            if len(folder_text) > tag_start:
                # Slice the element's *text*: the old code sliced the Tag
                # object itself, which raises instead of trimming the label.
                tag_text = folder_text[:len(folder_text) - tag_start]
            else:
                tag_text = title

            output_file.write(
                _format_entry(title, content_text, tag_text,
                              base_url + file_name))

    return 0

def main():
    """Script entry point: generate the search index by calling paquillo()."""
    # The Python 2 default-encoding workaround (reload(sys) followed by
    # sys.setdefaultencoding('utf-8')) was left disabled here and is not run.
    paquillo()


if __name__ == "__main__":
    main()

Resources

tipue search