Code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = u'Lukk 2010, Łukasz Grąbczewski 2011'
__version__ = '1.1'

'''
Duzy Format http://wyborcza.pl/duzyformat
'''

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib, time
from hashlib import md5

class duzyformat(BasicNewsRecipe):
    __author__ = u'Lukk, modified by Łukasz Grąbczewski'
    title = 'Duzy Format'
    description = u'Duży Format'
    language = 'pl_PL'
    publisher = 'Agora SA'
    publication_type = 'magazine'
    timefmt = ' [%a, %d %b %Y]'
    custom_title = u'Duży Format' + time.strftime(timefmt)
    conversion_options = {
        'title': custom_title,
        'authors': 'wyborcza.pl',
        'publisher': publisher,
        'language': language,
        'no_default_epub_cover': True,
        'preserve_cover_aspect_ratio': True,
    }

    # check articles from the last month
    oldest_article = 30
    max_articles_per_feed = 1000
    simultaneous_downloads = 20
    timeout = 30
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True
    extra_css = '.lead {font-weight: bold;}'

    # official RSS link
    feeds = [(u'Duży Format', u'http://rss.gazeta.pl/pub/rss/gazetawyborcza_duzyformat.xml')]

    # fix images for epub
    def postprocess_html(self, soup, first_fetch):
        return self.adeify_images(soup)

    # newest cover
    def get_cover_url(self):
        soup = self.index_to_soup('http://wyborcza.pl/duzyformat')
        belka = soup.findAll(attrs={'class': 'DFntldafE'})
        cover_url = belka[0].find('img')['src']
        return cover_url

    # change url to print version; the RSS links arrive feedsportal-encoded,
    # where '0H' stands for a comma and '0A' for a literal zero, hence the
    # split on '0H' and the replace('A', '') below
    def print_version(self, url):
        baseURL = 'http://wyborcza.pl'
        segments = url.split('0H')
        subPath = '/2029020,'
        articleURL1 = segments[1].replace('A', '')
        articleURL2 = segments[2].replace('A', '')
        printVerString = articleURL1 + ',' + articleURL2
        s = baseURL + subPath + printVerString + '.html'
        return s
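
    # Worked example (hypothetical encoded link, illustrative values only):
    #   url = 'http://rss.example0H754750H9149229'
    #   url.split('0H') -> ['http://rss.example', '75475', '9149229']
    #   result: 'http://wyborcza.pl/2029020,75475,9149229.html'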

    '''
    track downloaded articles
    copyright Pahan
    http://www.mobileread.com/forums/showpost.php?p=1295505
    '''
    def parse_feeds(self):
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # hashes of the articles seen on previous runs
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            # hash each article in the current fetch and drop the ones
            # that were already downloaded before
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds
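
To test the recipe from the command line, save it as e.g. duzyformat.recipe (the file name is just an example) and pass it straight to calibre's converter, which will fetch the feed and build the book:

Code:

ebook-convert duzyformat.recipe duzyformat.epub

Alternatively, add it in the calibre GUI under Fetch news > Add a custom news source.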