From 3630cd542375f52c82f05478228075e2b8e5fd1f Mon Sep 17 00:00:00 2001
From: Kanu Gaba
Date: Fri, 23 Apr 2021 17:07:14 -0400
Subject: [PATCH] [patreon:user] Add new extractor

---
 docs/supportedsites.md             |  1 +
 youtube_dl/extractor/extractors.py |  5 +-
 youtube_dl/extractor/patreon.py    | 78 ++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index a23da1a31..415731da0 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -691,6 +691,7 @@
  - **ParamountNetwork**
  - **parliamentlive.tv**: UK parliament videos
  - **Patreon**
+ - **Patreon:user**: Audio posts by user
  - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC)
  - **PearVideo**
  - **PeerTube**
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ac33cd996..ea2b755c5 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -895,7 +895,10 @@ from .palcomp3 import (
 )
 from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
-from .patreon import PatreonIE
+from .patreon import (
+    PatreonIE,
+    PatreonUserIE,
+)
 from .pbs import PBSIE
 from .pearvideo import PearVideoIE
 from .peertube import PeerTubeIE
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index 761a4b1de..c4bc76e06 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -13,6 +13,14 @@ from ..utils import (
     try_get,
 )
 
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+
+import re
+
 
 class PatreonIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)'
@@ -154,3 +162,73 @@ class PatreonIE(InfoExtractor):
                 })
 
         return info
+
+
+class PatreonUserIE(PatreonIE):
+    IE_NAME = 'Patreon:user'
+    IE_DESC = 'Audio posts by user'
+    _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>\w+)(?!.)'
+    _TESTS = [
+        # Standard
+        {
+            'url': 'https://www.patreon.com/joshuacitarella',
+            'info_dict': {
+                'id': 'joshuacitarella',
+                'title': "joshuacitarella's audio posts",
+            },
+            'playlist_mincount': 4,
+        },
+        # All Private
+        {
+            'url': 'https://www.patreon.com/juicysoup',
+            'info_dict': {
+                'id': 'juicysoup',
+                'title': "juicysoup's audio posts",
+            },
+            'playlist_mincount': 0,
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        playlist_title = video_id + "'s audio posts"
+
+        # Render the page in headless Chrome (via Selenium) and grab the DOM
+        # once an <audio> element is present (10 second timeout)
+        options = webdriver.chrome.options.Options()
+        options.add_argument("headless")
+        driver = webdriver.Chrome(options=options)
+        try:
+            driver.get(url)
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, 'audio')))
+            webpage = driver.page_source
+        except TimeoutException:
+            self.report_warning(
+                'Loading took too much time or no audio files found!')
+            return self.playlist_result([], video_id, playlist_title)
+        finally:
+            # Quit the browser on every path so no Chrome process is leaked
+            driver.quit()
+
+        entries = []
+        seen_ids = set()
+        for post_url, post_id in re.findall(
+                r'(?P<url>https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+))',
+                webpage):
+            # Keep page order, but visit each post only once
+            if post_id in seen_ids:
+                continue
+            seen_ids.add(post_id)
+
+            # Ask the API for the post type; only audio posts are kept
+            post = self._download_json(
+                'https://www.patreon.com/api/posts/' + post_id, post_id)
+            post_type = try_get(
+                post, lambda x: x['data']['attributes']['post_type'])
+            if post_type == 'audio_file':
+                entries.append(self.url_result(post_url, PatreonIE.ie_key()))
+            else:
+                self.to_screen('Not an audio post: %s' % post_url)
+
+        return self.playlist_result(entries, video_id, playlist_title)