Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'
import datetime

from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe

class Newsweek(BasicNewsRecipe):

    EDITION = '0'
    DATE = None
    YEAR = datetime.datetime.now().year
    title = u'Newsweek Polska'
    __author__ = 'matek09'
    description = 'Weekly magazine <br>'  # (kk)
    encoding = 'utf-8'
    language = 'pl'
    remove_javascript = True
    masthead_url = 'http://ipn.blox.pl/resource/newsweek_logo_250x250.jpg'  # (kk)
    temp_files = []
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        br = self.get_browser()
        br.open(url)
        source = br.response().read()
        page = self.index_to_soup(source)
        main_section = page.find(id='mainSection')
        title = main_section.find('h1')
        info = main_section.find('ul', attrs={'class': 'articleInfo'})
        authors = info.find('li').find('h4')
        article = main_section.find('div', attrs={'id': 'article'})
        if article.find('div', attrs={'class': 'relatedBox'}) is not None:
            article.find('div', attrs={'class': 'relatedBox'}).replaceWith('')
        html = unicode(title) + unicode(authors) + unicode(article)
        next = main_section.find('li', attrs={'class': 'next'})

        # Follow the pagination links and append every remaining page of the article
        while next:
            url = next.find('a')['href']
            br.open(url)
            source = br.response().read()
            page = self.index_to_soup(source)
            main_section = page.find(id='mainSection')
            article = main_section.find('div', attrs={'id': 'article'})
            aside = article.find(id='articleAside')
            if aside is not None:
                aside.extract()
            html = html + unicode(article)
            next = main_section.find('li', attrs={'class': 'next'})

        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    def is_full(self, issue_soup):
        while True:
            main_section = issue_soup.find(id='mainSection')
            next = main_section.find('li', attrs={'class': 'next'})
            if len(main_section.findAll(attrs={'class': 'locked'})) > 6:  # (kk) how many locked articles are allowed
                return False
            elif next is None:
                return True
            else:
                issue_soup = self.index_to_soup(next.find('a')['href'])

    def find_last_full_issue(self, archive_url):
        archive_soup = self.index_to_soup(archive_url)
        select = archive_soup.find('select', attrs={'id': 'paper_issue_select'})
        for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')):
            self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/', '')
            issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
            if self.is_full(issue_soup):
                return
        # No full issue found this year, so retry with the previous year's archive
        self.YEAR = self.YEAR - 1
        self.find_last_full_issue(archive_url + ',' + str(self.YEAR))

    def parse_index(self):
        archive_url = 'http://www.newsweek.pl/wydania/archiwum'
        self.find_last_full_issue(archive_url)
        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
        self.DATE = self.tag_to_string(soup.find('span', attrs={'class': 'data'}))
        main_section = soup.find(id='mainSection')
        img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
        self.cover_url = img['src']
        feeds = []
        articles = {}
        sections = []
        section = ''  # initialized up front so a linked entry before any header cannot raise a NameError
        while True:
            news_list = main_section.find('ul', attrs={'class': 'newsList'})
            for h2 in news_list.findAll(['h2', 'h3']):
                if h2.a is None:
                    section = self.tag_to_string(h2)
                    continue
                if section == '':
                    continue
                if not articles.has_key(section):
                    sections.append(section)
                    articles[section] = []
                article = self.create_article(h2)
                articles[section].append(article)
            next = main_section.find('li', attrs={'class': 'next'})
            if next is None:
                break
            soup = self.index_to_soup(next.find('a')['href'])
            main_section = soup.find(id='mainSection')
        for section in sections:
            feeds.append((section, articles[section]))
        return feeds

    def create_article(self, h2):
        article = {}
        a = h2.find('a')
        if a is not None:  # (kk) added a guard for headers without a link
            article['title'] = self.tag_to_string(a)
            article['url'] = a['href']
            article['date'] = self.DATE
            article['author'] = self.tag_to_string(h2.findNext('p'))  # (kk) added
            # (kk) added: fetch the article page and use its lead paragraph as the description
            article_page = self.index_to_soup(article['url'])
            article['description'] = self.tag_to_string(article_page.find('div', attrs={'class': ['art_lead', 'lead']}))
        return article

    def populate_article_metadata(self, article, soup, first):  # (kk) added
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])
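To try the modified recipe without downloading a whole issue, calibre can run it straight from the command line; the file name newsweek_polska.recipe below is just an example:

Code:
ebook-convert newsweek_polska.recipe .epub --test -vv

--test limits the download to a couple of articles per feed and -vv prints verbose output, which makes it easy to check whether the locked-article detection in is_full() picks a complete issue.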