From 80fa6e5327e00c313bba422673717a02e10f4a9f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 11 Jan 2022 22:09:49 +0530 Subject: [PATCH] [facebook] Improve title and uploader extraction Closes #1943, closes #795 --- yt_dlp/extractor/facebook.py | 51 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 52f1d0d6a5..0a1d614cbb 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -20,13 +20,13 @@ get_element_by_id, int_or_none, js_to_json, - limit_length, merge_dicts, network_exceptions, parse_count, parse_qs, qualities, sanitized_Request, + traverse_obj, try_get, url_or_none, urlencode_postdata, @@ -398,28 +398,31 @@ def _extract_from_url(self, url, video_id): url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) def extract_metadata(webpage): - video_title = self._html_search_regex( - r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, - 'title', default=None) - if not video_title: - video_title = self._html_search_regex( - r'(?s)(.*?)', - webpage, 'alternative title', default=None) - if not video_title: - video_title = self._html_search_meta( - ['og:title', 'twitter:title', 'description'], - webpage, 'title', default=None) - if video_title: - video_title = limit_length(video_title, 80) - else: - video_title = 'Facebook video #%s' % video_id - description = self._html_search_meta( + media_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( + r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] + media = traverse_obj(media_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'result', 'data', 'attachments', ..., 'media'), expected_type=dict) + media = [m for m in media if str(m.get('id')) == video_id and m.get('__typename') == 'Video'] + + video_title = traverse_obj(media, (..., 'title', 'text'), get_all=False) + description = traverse_obj(media, ( + ..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False) + uploader = traverse_obj(media, (..., 'owner', 'name'), get_all=False) + uploader_id = traverse_obj(media, (..., 'owner', 'id'), get_all=False) + + video_title = video_title or self._html_search_regex(( + r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', + r'(?s)(?P.*?)', + self._meta_regex('og:title'), self._meta_regex('twitter:title'), self._meta_regex('description'), + ), webpage, 'title', default=None, group='content') + description = description or self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) - uploader = clean_html(get_element_by_id( - 'fbPhotoPageAuthorName', webpage)) or self._search_regex( - r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', - default=None) or self._og_search_title(webpage, fatal=False) + uploader = uploader or ( + clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + or self._search_regex( + (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False)) + timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) @@ -434,17 +437,17 @@ def extract_metadata(webpage): r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', default=None)) info_dict = { - 'title': video_title, + 'title': video_title or description.replace('\n', ' ') or f'Facebook video #{video_id}', 'description': description, 'uploader': uploader, + 'uploader_id': uploader_id, 'timestamp': timestamp, 'thumbnail': thumbnail, 'view_count': view_count, } info_json_ld = self._search_json_ld(webpage, video_id, default={}) if info_json_ld.get('title'): - info_json_ld['title'] = limit_length( - re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80) + info_json_ld['title'] = re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']) return merge_dicts(info_json_ld, info_dict) video_data = None