ویکی‌پدیا:درخواست‌های ربات/ربات استخراج جعبه از درون مقاله/ویرایش 0

این ربات فهرستی از صفحه‌ها در ویکی‌پدیای انگلیسی را دریافت می‌کند، به کمک ابزار کمک مترجم یا روش‌های دیگر پیوندها را به فارسی برمی‌گرداند و خروجی مناسب را برای ربات pagefromfile.py تولید می‌کند که به واسطهٔ این ربات محتوای فایل تولیدشده در ویکی‌پدیای فارسی بارگذاری می‌شود.

پیش از اجرای ربات

به یکی از دو روش توضیح داده‌شده در این بخش خروجی XML را دریافت کنید.
مطمئن شوید encoding فایل دریافتی UTF-8 باشد.
نام فایل را به ExportFromWiki.txt تغییر دهید.

روند کار

ربات را اجرا کنید.
پس از تولید کامل و صحیح فایل ExportFromWikiResult.txt به واسطهٔ مرحلهٔ پیش، ربات pagefromfile.py را به شکل دستور ذکرشده در زیر اجرا کنید.

python pagefromfile.py -file:ExportFromWikiResult.txt -start:@@@ -end:@@@ -notitle -summary:'ربات:درون‌ریزی الگو' -pt:0
python pwb.py pagefromfile -file:ExportFromWikiResult.txt -start:@@@ -end:@@@ -notitle -summary:'ربات:درون‌ریزی الگو' -pt:0

ترفند

برای ترجمهٔ عنوان‌هایی که در ویکی‌پدیای فارسی وجود ندارند، می‌توانید از متغیر listp استفاده کنید.

دریافت خروجی XML

به صفحهٔ برون‌ریزی از ویکی‌پدیای انگلیسی مراجعه کنید، اکنون یکی از دو روش زیر را برگزینید:
1. برای دریافت خروجی از رده، نام رده را در فیلد نخست وارد کنید.
2. برای دریافت خروجی به طور مستقیم از نام صفحه‌ها، داده‌ها را در فیلد دوم وارد کنید (هر نام در یک خط).
مطمئن شوید گزینه‌های Include only the current revision, not the full history و Save as file فعال باشد.
روی دکمهٔ Export کلیک کنید.

برای اطلاعات بیشتر

کاربر:Reza1615/درون‌ریزی را برای اطلاعات بیشتر مطالعه کنید.

کد ربات

#!/usr/bin/python
# -*- coding: utf-8  -*-
#
# Reza(User:reza1615), 2011
#
# Distributed under the terms of the CC-BY-SA 3.0 .
#!/usr/bin/python
# -*- coding: utf-8  -*-
import query
import pagegenerators,re
import wikipedia,codecs
# Set the configured put throttle to 0 and re-apply the delay on the shared
# throttle object (presumably so that page writes are not rate-limited --
# confirm against the pywikipedia `wikipedia` module).
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
# Manual title translations, consumed by linktranslation() as a flat sequence
# of pairs: the item at each even index is replaced by the item that follows
# it, wherever it appears wrapped in ''' (bold) markers in the generated file.
# For example: u'example', u'نمونه'
listp=()

def englishdictionry( enlink ,firstsite,secondsite):
    """Return the title `enlink` is interlanguage-linked to on `secondsite`.

    The link text is first reduced to a bare title: ``[[``, ``]]`` and the
    ``en:`` / ``fa:`` prefixes are removed and spaces become underscores.
    The langlinks of that title are then requested from `firstsite`
    (``action=query``, ``prop=langlinks``, following redirects).

    Parameters:
        enlink: link text or title; a byte string is decoded as UTF-8.
        firstsite: site code handed to ``wikipedia.getSite`` for the lookup.
        secondsite: language code whose langlink is wanted.

    Returns:
        The linked title, or ``False`` when the input is empty or contains
        ``#``, when the request or the response parsing fails, when no
        langlink for `secondsite` exists, or when the linked title itself
        contains ``#``.
    """
    text_type = type(u'')
    # Normalise the input to unicode text with an explicit type check rather
    # than relying on a conversion attempt raising.
    if isinstance(enlink, bytes):
        enlink = enlink.decode('UTF-8')
    elif not isinstance(enlink, text_type):
        enlink = text_type(enlink)
    for fragment in (u'[[', u']]', u'en:', u'fa:'):
        enlink = enlink.replace(fragment, u'')
    # Section links and empty titles are never looked up.
    if u'#' in enlink or enlink == u'':
        return False
    enlink = enlink.replace(u' ', u'_')
    site = wikipedia.getSite(firstsite)
    params = {
        'action': 'query',
        'prop': 'langlinks',
        'titles': enlink,
        'redirects': 1,
        'lllimit': 500,
    }
    try:
        response = query.GetData(params, site)
        pages = response[u'query'][u'pages']
        for page in pages.values():
            # A page without interlanguage links has no 'langlinks' key.
            for link in page.get(u'langlinks', ()):
                if link[u'lang'] == secondsite:
                    target = link[u'*']
                    if u'#' in target:
                        return False
                    return target
    except Exception:
        # Best effort: any request or response-shape problem means "no
        # translation".  Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit stop the bot.
        return False
    return False
def linktranslation():
    """Translate the wikilinks inside ExportFromWikiResult.txt in place.

    Steps:
      1. Apply the manual bold-title translations from ``listp`` (pairs of
         source / replacement strings) and write the file back.
      2. Collect every ``[[...]]`` link.  Links that contain one of the skip
         markers (certain Persian letters, or a user/file/template/
         Wikipedia/talk/help namespace prefix) are left untouched.
      3. Look up each remaining link target with englishdictionry().  A
         translated link is replaced by ``@1@<title>@2@``; an untranslated
         one is re-wrapped as ``@1@...@2@`` with its original content.
      4. Turn the ``@1@`` / ``@2@`` placeholders back into ``[[`` / ``]]``,
         turn ``$$$[[`` / ``]]$$$`` into bold markers, and write the file.

    Returns nothing; the result is the rewritten file plus progress output.
    """
    wikipedia.output(u'link translating is started')
    result_name = u'ExportFromWikiResult.txt'
    # Read through a context manager so the handle is closed before the same
    # file is reopened for writing.
    with codecs.open(result_name, 'r', 'utf8') as source:
        text = source.read()
    # listp is a flat sequence of (source, replacement) pairs; stopping at
    # len - 1 ignores a trailing unpaired item instead of raising IndexError.
    for index in range(0, len(listp) - 1, 2):
        text = text.replace(u"'''" + listp[index] + u"'''",
                            u"'''" + listp[index + 1] + u"'''")
    with codecs.open(result_name, mode='w', encoding='utf8') as f:
        f.write(text)

    # Same pattern text as the former ur'\[\[.*?\]\]' literal, spelled with
    # escaped backslashes so the literal does not depend on the ur'' prefix.
    link_pattern = re.compile(u'\\[\\[.*?\\]\\]', re.S)
    links = link_pattern.findall(text)
    # Counted once: re-scanning the whole text after every replacement was
    # quadratic and made the progress denominator shrink as the counter grew.
    total = len(links)
    # A link containing any of these substrings is not translated.
    skip_markers = (
        u'user:', u'ه', u'ر', u'س', u'ف', u'ا', u'ن', u'ی', u'آ', u'ب', u'م',
        u'User:', u'file:', u'File:', u'template:', u'Template:',
        u'Wikipedia:', u'wikipedia:', u'Talk:', u'talk:', u'Help:', u'help:',
    )
    for counter, wikilink in enumerate(links, 1):
        item = wikilink.replace(u'en:', u'')
        if any(marker in item for marker in skip_markers):
            continue
        # An identical link that was already rewritten no longer occurs in
        # the text, so it is not looked up a second time.
        if wikilink not in text:
            continue
        itemen = item.split(u'|')[0].replace(u'[[', u'').replace(u']]', u'').strip()
        itemfa = englishdictionry(itemen, 'en', 'fa')
        wikipedia.output(itemen)
        if itemfa is False:
            # No translation: keep the link content, only swap the brackets
            # for placeholders.
            untranslated = item.replace(u'[[', u'').replace(u']]', u'').strip()
            text = text.replace(u'[[' + untranslated + u']]',
                                u'@1@' + untranslated + u'@2@')
            continue
        text = text.replace(wikilink, u'@1@' + itemfa + u'@2@')
        wikipedia.output(u'\03{lightred}----' + str(counter) + u'/' + str(total) + u'----\03{default}')
        wikipedia.output(itemen)
    text = text.replace(u'@1@', u'[[').replace(u'@2@', u']]')
    text = text.replace(u'$$$[[', u"'''").replace(u']]$$$', u"'''")
    with codecs.open(result_name, mode='w', encoding='utf8') as f:
        f.write(text)
def run():
    """Convert the XML export ExportFromWiki.txt into ExportFromWikiResult.txt.

    The input is split on ``<page>``.  For every page the title and the
    text of its last ``<revision>`` are extracted, XML entities and a few
    known mis-decoded character sequences are repaired, and one record of
    the form ``@@@ / '''title''' / text / @@@`` is written.  The output file
    starts with a single newline.

    If a page has no ``<text>...</text>`` element, the raw remainder of the
    revision is written (as before) and a warning is printed.

    Returns nothing.
    """
    source_name = 'ExportFromWiki.txt'  # XML export that came from the other wiki
    result_name = 'ExportFromWikiResult.txt'
    # Predefined XML entities; '&amp;' must be handled last so that
    # '&amp;lt;' becomes '&lt;' and is not unescaped twice.
    xml_entities = (
        (u'&lt;', u'<'),
        (u'&gt;', u'>'),
        (u'&quot;', u'"'),
        (u'&amp;', u'&'),
    )
    # Mis-decoded punctuation, repaired in the page text only.
    punctuation_mojibake = (
        (u'â€¢', u'•'),
        (u'â€“', u'-'),
    )
    # Mis-decoded accented letters, repaired in both title and text.  The
    # sequence for 'í' ends in an invisible soft hyphen (U+00AD); it is
    # written as an escape so it cannot get lost and so that a bare 'أ' is
    # never replaced on its own (which would also keep the 'ô' rule from
    # ever matching).
    letter_mojibake = (
        (u'أ£', u'ã'),
        (u'أ©', u'é'),
        (u'أ\xad', u'í'),
        (u'أ´', u'ô'),
    )
    # Accept the <text> attributes in any order.
    text_element = re.compile(u'<text\\b[^>]*>(.*?)</text', re.S)

    with codecs.open(source_name, 'r', 'utf8') as source:
        dump = source.read()

    with codecs.open(result_name, mode='w', encoding='utf8') as result:
        result.write(u'\n')
        # Everything before the first <page> is the export header.
        for page in dump.split(u'<page>')[1:]:
            title = page.split(u'</title>')[0].replace(u'<title>', u'').strip()
            for broken, fixed in xml_entities + letter_mojibake:
                title = title.replace(broken, fixed)

            section = page.split(u'<revision>')[-1]
            found = text_element.search(section)
            if found:
                section = found.group(1)
            else:
                wikipedia.output(u'No <text> element found for page: ' + title)
            for broken, fixed in xml_entities + punctuation_mojibake + letter_mojibake:
                section = section.replace(broken, fixed)

            result.write(u"\n@@@\n'''%s'''\n" % title + section + u'\n@@@')

    wikipedia.output(u'\03{lightred}text filtering is done!\03{default}')
    wikipedia.output(u'\03{lightgreen}--------------------------------------------\03{default}')
def main():
    """Build the import file from the XML export, then translate its links."""
    # Order matters: linktranslation() reads the file that run() writes.
    for stage in (run, linktranslation):
        stage()
    
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
        main()