fixing PLAN B elements of title and URL.
tanimislam committed Oct 3, 2024
1 parent 49f61cf commit 2edced3
Showing 1 changed file with 57 additions and 27 deletions.
84 changes: 57 additions & 27 deletions nprstuff/core/freshair.py
@@ -430,13 +430,22 @@ def _get_title_url_here( article_elem ):
     return { 'title' : candidate_title, 'url' : candidate_url }
 
 def _get_title_url_here_something( elem ):
-    assert( 'title' in elem )
-    candidate_title = _fix_title( elem[ 'title' ].split(':')[1].strip( ) )
-    #
-    assert( 'audioUrl' in elem )
-    candidate_url_split = urlsplit( elem[ 'audioUrl' ] )
-    candidate_url = '%s://%s%s' % ( candidate_url_split.scheme, candidate_url_split.netloc, candidate_url_split.path )
-    return { 'title' : candidate_title, 'url' : candidate_url }
+    assert( 'title' in elem )
+    candidate_title = _fix_title( elem[ 'title' ].strip( ) )
+    #
+    try:
+        assert( 'audioUrl' in elem )
+        candidate_url_split = urlsplit( elem[ 'audioUrl' ] )
+        assert( candidate_url_split.scheme != '' )
+        candidate_url = '%s://%s%s' % ( candidate_url_split.scheme, candidate_url_split.netloc, candidate_url_split.path )
+        return { 'title' : candidate_title, 'url' : candidate_url }
+    except Exception as e:
+        pass
+    assert( 'storyUrl' in elem )
+    candidate_url_split = urlsplit( elem[ 'storyUrl' ] )
+    assert( candidate_url_split.scheme != '' )
+    candidate_url = '%s://%s%s' % ( candidate_url_split.scheme, candidate_url_split.netloc, candidate_url_split.path )
+    return { 'title' : candidate_title, 'url' : candidate_url }
 
 def _plan_C2_get_title_url_here( story_elem ): # stuff tried out 2024-07-30
     url_line_elems = list( story_elem.find_all( 'a' ) )
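In the rewritten _get_title_url_here_something, the title is no longer split on ':' and the URL now prefers a cleaned-up audioUrl, falling back to storyUrl when audioUrl is missing or schemeless. A minimal standalone sketch of that fallback, using urlsplit the same way; pick_clean_url and the sample dicts are hypothetical, shaped like the data-play-all audioData entries:

# Sketch only: pick_clean_url and the sample dicts below are illustrative, not part of nprstuff.
from urllib.parse import urlsplit

def pick_clean_url( elem ):
    # prefer audioUrl, fall back to storyUrl; drop any query string and fragment
    for key in ( 'audioUrl', 'storyUrl' ):
        if key not in elem: continue
        parts = urlsplit( elem[ key ] )
        if parts.scheme == '': continue
        return '%s://%s%s' % ( parts.scheme, parts.netloc, parts.path )
    raise ValueError( 'no usable URL in element' )

print( pick_clean_url( { 'title' : 'Segment one', 'audioUrl' : 'https://ondemand.npr.org/x_fa_01.mp3?orgId=1' } ) )
print( pick_clean_url( { 'title' : 'Segment two', 'storyUrl' : 'https://www.npr.org/2024/10/03/some-story' } ) )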
@@ -506,7 +515,7 @@ def _plan_C_get_title_url_here( elem ):
     article_url_elem = article_url_elem[ 0 ]
     article_url_split = urlsplit( article_url_elem['href'].strip( ) )
     article_url = '%s://%s%s' % ( article_url_split.scheme, article_url_split.netloc, article_url_split.path )
-    logger.debug( 'URL TO GET = %s.' % article_url )
+    logger.info( 'URL TO GET = %s.' % article_url )
     #
     ## first get the info
     resp = requests.get( article_url )
@@ -526,29 +535,51 @@ def _plan_C_get_title_url_here( elem ):
             map(_get_title_url_here, story_list_elem.find_all( 'article', { 'class' : 'rundown-segment' } ) ),
             key = lambda entry: entry[ 'url' ] )
         if len( article_infos_in_order ) == 0:
-            raise ValueError("FAILED PLAN A")
+            raise ValueError("FAILED PLAN A: length of articles = 0")
         if any(map(lambda entry: len( entry['title'].strip( ) ) <= 20, article_infos_in_order ) ):
-            raise ValueError( "FAILED ON PLAN A")
-        logging.debug( 'PLAN A: %s' % article_infos_in_order )
+            raise ValueError( "FAILED ON PLAN A: length of any of the titles is <= 0")
+        logging.info( 'PLAN A: %s' % article_infos_in_order )
         return list(map(lambda entry: ( entry['title'], entry['url'] ), article_infos_in_order))
-    except: pass
+    except Exception as e:
+        logging.info( "PROBLEM GETTING PLAN A: %s." % str( e ) )
+        pass
     #
     ## otherwise PLAN B do the needful, see if this works...
     try:
-        actual_elems = list(filter(lambda elem: 'data-play-all' in elem.attrs, myhtml.find_all('b')))
-        assert( len( actual_elems ) != 0 )
-        actual_elem = actual_elems[ 0 ]
-        data = json.loads( actual_elem['data-play-all'] )
-        assert( 'audioData' in data )
-        data_audio = data[ 'audioData' ]
-        #
-        article_infos_in_order = sorted(
-            map(_get_title_url_here_something, data_audio ),
-            key = lambda entry: entry[ 'url' ] )
-        if len( article_infos_in_order ) != 0:
-            return list(map(lambda entry: ( entry['title'], entry['url'] ), article_infos_in_order))
-        raise ValueError("FAILED PLAN B")
-    except: pass
+        actual_elems = list(filter(lambda elem: 'data-play-all' in elem.attrs, myhtml.find_all('b')))
+        assert( len( actual_elems ) != 0 )
+        actual_elem = actual_elems[ 0 ]
+        data = json.loads( actual_elem['data-play-all'] )
+        assert( 'audioData' in data )
+        data_audio = data[ 'audioData' ]
+        #
+        article_infos_in_order = sorted(
+            map(_get_title_url_here_something, data_audio ),
+            key = lambda entry: entry[ 'url' ] )
+        if len( article_infos_in_order ) == 0:
+            raise ValueError( "ERROR, PLAN B: length of articles = 0" )
+
+        #
+        ## now perform more sewage-processing of crappy NPR Fresh Air results
+        bad_elems = list(filter(lambda elem: not elem['url'].endswith( '.mp3' ), article_infos_in_order ) )
+        if len( bad_elems ) == 1:
+            possible_story_urls = set(
+                map(lambda idx:
+                    'https://ondemand.npr.org/anon.npr-mp3/npr/fa/%d/%02d/%s_fa_%02d.mp3' % (
+                        date_s.year, date_s.month, date_s.strftime( '%Y%m%d' ), idx ), range(1, len( article_infos_in_order ) + 1 ) ) )
+            possible_story_rems = possible_story_urls - set(map(lambda elem: elem['url'], article_infos_in_order ) )
+            assert( len( possible_story_rems ) == 1 )
+            rem_story = max( possible_story_rems )
+            for elem in article_infos_in_order:
+                if elem['url'].endswith( '.mp3' ): continue
+                elem[ 'url' ] = rem_story
+        logging.info( 'PLAN B: %s' % article_infos_in_order )
+        return list(map(lambda entry: ( entry['title'], entry['url'] ), article_infos_in_order))
+
+        raise ValueError("FAILED PLAN B: length of articles = 0")
+    except Exception as e:
+        logging.info( "PROBLEM GETTING PLAN B: %s." % str( e ) )
+        pass
     #
     ## otherwise PLAN C what the fuck??
     # actual_elems = list(filter(lambda elem: 'data-embed-url' in elem.attrs and 'data-metrics-ga4' in elem.attrs, myhtml.find_all()))
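The new PLAN B tail is what the commit message refers to: when exactly one entry's URL is not an .mp3, the code enumerates the expected ondemand.npr.org segment URLs for the episode date, subtracts the URLs already present, and assigns the single leftover URL to the bad entry. A compact sketch of that repair step; repair_missing_mp3 and the sample entries are hypothetical, while the date_s fields and the URL pattern come straight from the diff:

# Sketch only: repair_missing_mp3 is illustrative, not part of nprstuff.
from datetime import date

def repair_missing_mp3( entries, date_s ):
    bad = [ e for e in entries if not e['url'].endswith( '.mp3' ) ]
    if len( bad ) != 1: return entries  # only the single-missing-segment case is handled
    expected = set(
        'https://ondemand.npr.org/anon.npr-mp3/npr/fa/%d/%02d/%s_fa_%02d.mp3' % (
            date_s.year, date_s.month, date_s.strftime( '%Y%m%d' ), idx )
        for idx in range(1, len( entries ) + 1 ) )
    remaining = expected - set( e['url'] for e in entries )
    assert len( remaining ) == 1
    bad[ 0 ][ 'url' ] = remaining.pop( )
    return entries

entries = [
    { 'title' : 'Segment one', 'url' : 'https://ondemand.npr.org/anon.npr-mp3/npr/fa/2024/10/20241003_fa_01.mp3' },
    { 'title' : 'Segment two', 'url' : 'https://www.npr.org/2024/10/03/some-story-page' } ]
print( repair_missing_mp3( entries, date( 2024, 10, 3 ) ) )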
@@ -558,7 +589,6 @@ def _plan_C_get_title_url_here( elem ):
             map(_plan_C2_get_title_url_here, actual_elems ),
             key = lambda entry: entry[ 'url' ] )
         return list(map(lambda entry: ( entry['title'], entry['url'] ), article_infos_in_order))
-
     except Exception as e:
         logger.info( str( e ) )
         return None
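Beyond the title and URL fixes, the diff also replaces the bare except: pass clauses for PLAN A and PLAN B with except Exception as e plus a logging.info call, so each failed plan is recorded before the next one is attempted. A small sketch of that fall-through pattern; run_plans is a hypothetical helper, not something defined in freshair.py:

# Sketch only: run_plans is illustrative, not part of nprstuff.
import logging

def run_plans( *plans ):
    # try each plan in order; log the failure reason and fall through to the next
    for plan in plans:
        try:
            result = plan( )
            if result: return result
        except Exception as e:
            logging.info( 'PROBLEM GETTING %s: %s.', plan.__name__, e )
    return None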