Skip to content

Commit

Permalink
Feedcore: Improve parsing of publishing date
Browse files Browse the repository at this point in the history
This works around problems with invalid date formats
and also prefers the "published" field over "updated".

See also Feedparser issue 327:
http://code.google.com/p/feedparser/issues/detail?id=327
  • Loading branch information
thp committed Feb 12, 2012
1 parent d0eaccb commit ee2a5b7
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 11 deletions.
39 changes: 39 additions & 0 deletions src/gpodder/feedcore.py
Expand Up @@ -24,6 +24,13 @@

import feedparser

try:
# Python 2
from rfc822 import mktime_tz
except ImportError:
# Python 3
from email.utils import mktime_tz

def patch_feedparser():
"""Monkey-patch the Universal Feed Parser"""
# Detect the 'plain' content type as 'text/plain'
Expand Down Expand Up @@ -262,3 +269,35 @@ def fetch(self, url, etag=None, modified=None):
"""
self._parse_feed(url, etag, modified)


def get_pubdate(entry):
    """Return the publishing date of a feedparser entry as a UNIX timestamp

    The parsed "published" date takes precedence over the parsed "updated"
    date, which is only used as a fallback. If neither parsed value is
    available, the raw "published" and "updated" strings are re-parsed (in
    that order) with all commas removed, to work around date formats that
    feedparser does not understand out of the box. See also:
    http://code.google.com/p/feedparser/issues/detail?id=327

    entry -- a feedparser entry (only its .get() method is used)

    Returns the value of mktime_tz() for the date with a timezone offset of
    zero, or 0 if no usable date could be determined.
    """
    pubdate = entry.get('published_parsed', None)

    if pubdate is None:
        pubdate = entry.get('updated_parsed', None)

    if pubdate is None:
        # FIXME: This is kludgy. We should write our own date handler
        # and register it with feedparser.registerDateHandler() and/or
        # wait for feedparser to add support for this bogus date format.
        for key in ('published', 'updated'):
            raw_date = entry.get(key, None)
            if not raw_date:
                # Missing or empty - try the next candidate instead
                continue

            pubdate = feedparser._parse_date(raw_date.replace(',', ''))
            if pubdate is not None:
                break

    if pubdate is None:
        # Cannot determine pubdate - party like it's 1970!
        return 0

    try:
        # mktime_tz() expects a 10-tuple; the appended 0 is the tz offset
        return mktime_tz(tuple(pubdate) + (0,))
    except (TypeError, ValueError, OverflowError):
        # Bogus/out-of-range date - treat it like a missing pubdate
        return 0

13 changes: 2 additions & 11 deletions src/gpodder/model.py
Expand Up @@ -41,13 +41,6 @@
import time
import datetime

try:
# Python 2
from rfc822 import mktime_tz
except ImportError:
# Python 3
from email.utils import mktime_tz

import hashlib
import feedparser
import collections
Expand Down Expand Up @@ -218,8 +211,7 @@ def from_feedparser_entry(cls, entry, channel):
if not episode.description:
episode.description = entry.get('subtitle', '')

if entry.get('updated_parsed', None):
episode.published = mktime_tz(entry.updated_parsed+(0,))
episode.published = feedcore.get_pubdate(entry)

enclosures = entry.get('enclosures', [])
media_rss_content = entry.get('media_content', [])
Expand Down Expand Up @@ -1011,8 +1003,7 @@ def _consume_updated_feed(self, feed, max_episodes=0):
# max_episodes old episodes, new episodes will not be shown.
# See also: gPodder Bug 1186
try:
entries = sorted(feed.entries, \
key=lambda x: x.get('updated_parsed', (0,)*9), \
entries = sorted(feed.entries, key=feedparser.get_pubdate,
reverse=True)[:max_episodes]
except Exception, e:
logger.warn('Could not sort episodes: %s', e, exc_info=True)
Expand Down

0 comments on commit ee2a5b7

Please sign in to comment.