Update channel playlists to use polymer

This commit is contained in:
Omar Roth 2020-06-16 17:51:35 -05:00
parent 074497b0f6
commit 056e7432bd
No known key found for this signature in database
GPG Key ID: B8254FB7EC3D37F2
5 changed files with 47 additions and 386 deletions

View File

@ -41,7 +41,7 @@ describe "Helper" do
describe "#extract_channel_playlists_cursor" do
it "correctly extracts a playlists cursor from the given URL" do
extract_channel_playlists_cursor("/browse_ajax?continuation=4qmFsgLRARIYVUNDajk1NklGNjJGYlQ3R291c3phajl3GrQBRWdsd2JHRjViR2x6ZEhNWUF5QUJNQUk0QVdBQmFnQjZabEZWYkZCaE1XczFVbFpHZDJGV09XNWxWelI0V0RGR2VWSnVWbUZOV0Vwc1ZHcG5lRmd3TVU1aVZXdDRWMWN4YzFGdFNuTmtlbWh4VGpCd1NWTllVa1pTYTJNeFlVUmtlRmt3Y0ZWVWJWRXdWbnBzTkU1V1JqRmhNVGxFVm14dmQwMXFhRzVXZDdnQkFBJTNEJTNE&gl=US&hl=en", false).should eq("AIOkY9EQpi_gyn1_QrFuZ1reN81_MMmI1YmlBblw8j7JHItEFG5h7qcJTNd4W9x5Quk_CVZ028gW")
extract_channel_playlists_cursor("4qmFsgLRARIYVUNDajk1NklGNjJGYlQ3R291c3phajl3GrQBRWdsd2JHRjViR2x6ZEhNWUF5QUJNQUk0QVdBQmFnQjZabEZWYkZCaE1XczFVbFpHZDJGV09XNWxWelI0V0RGR2VWSnVWbUZOV0Vwc1ZHcG5lRmd3TVU1aVZXdDRWMWN4YzFGdFNuTmtlbWh4VGpCd1NWTllVa1pTYTJNeFlVUmtlRmt3Y0ZWVWJWRXdWbnBzTkU1V1JqRmhNVGxFVm14dmQwMXFhRzVXZDdnQkFBJTNEJTNE", false).should eq("AIOkY9EQpi_gyn1_QrFuZ1reN81_MMmI1YmlBblw8j7JHItEFG5h7qcJTNd4W9x5Quk_CVZ028gW")
end
end

View File

@ -216,30 +216,18 @@ def fetch_channel(ucid, db, pull_all_videos = true, locale = nil)
url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated)
response = YT_POOL.client &.get(url)
videos = [] of SearchVideo
begin
json = JSON.parse(response.body)
initial_data = JSON.parse(response.body).as_a.find &.["response"]?
raise "Could not extract JSON" if !initial_data
videos = extract_videos(initial_data.as_h, author, ucid)
rescue ex
if response.body.includes?("To continue with your YouTube experience, please fill out the form below.") ||
response.body.includes?("https://www.google.com/sorry/index")
raise "Could not extract channel info. Instance is likely blocked."
end
raise "Could not extract JSON"
end
if json["content_html"]? && !json["content_html"].as_s.empty?
document = XML.parse_html(json["content_html"].as_s)
nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
if auto_generated
videos = extract_videos_html(nodeset)
else
videos = extract_videos_html(nodeset, ucid, author)
end
end
videos ||= [] of ChannelVideo
rss.xpath_nodes("//feed/entry").each do |entry|
video_id = entry.xpath_node("videoid").not_nil!.content
title = entry.xpath_node("title").not_nil!.content
@ -305,24 +293,11 @@ def fetch_channel(ucid, db, pull_all_videos = true, locale = nil)
loop do
url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated)
response = YT_POOL.client &.get(url)
json = JSON.parse(response.body)
initial_data = JSON.parse(response.body).as_a.find &.["response"]?
raise "Could not extract JSON" if !initial_data
videos = extract_videos(initial_data.as_h, author, ucid)
if json["content_html"]? && !json["content_html"].as_s.empty?
document = XML.parse_html(json["content_html"].as_s)
nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
else
break
end
nodeset = nodeset.not_nil!
if auto_generated
videos = extract_videos_html(nodeset)
else
videos = extract_videos_html(nodeset, ucid, author)
end
count = nodeset.size
count = videos.size
videos = videos.map { |video| ChannelVideo.new(
id: video.id,
title: video.title,
@ -387,23 +362,11 @@ def fetch_channel_playlists(ucid, author, auto_generated, continuation, sort_by)
url = produce_channel_playlists_url(ucid, continuation, sort_by, auto_generated)
response = YT_POOL.client &.get(url)
json = JSON.parse(response.body)
if json["load_more_widget_html"].as_s.empty?
continuation = nil
else
continuation = XML.parse_html(json["load_more_widget_html"].as_s)
continuation = continuation.xpath_node(%q(//button[@data-uix-load-more-href]))
if continuation
continuation = extract_channel_playlists_cursor(continuation["data-uix-load-more-href"], auto_generated)
end
end
html = XML.parse_html(json["content_html"].as_s)
nodeset = html.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
continuation = response.body.match(/"continuation":"(?<continuation>[^"]+)"/).try &.["continuation"]?
initial_data = JSON.parse(response.body).as_a.find(&.["response"]?).try &.as_h
else
url = "/channel/#{ucid}/playlists?disable_polymer=1&flow=list&view=1"
url = "/channel/#{ucid}/playlists?flow=list&view=1"
case sort_by
when "last", "last_added"
@ -416,21 +379,13 @@ def fetch_channel_playlists(ucid, author, auto_generated, continuation, sort_by)
end
response = YT_POOL.client &.get(url)
html = XML.parse_html(response.body)
continuation = html.xpath_node(%q(//button[@data-uix-load-more-href]))
if continuation
continuation = extract_channel_playlists_cursor(continuation["data-uix-load-more-href"], auto_generated)
end
nodeset = html.xpath_nodes(%q(//ul[@id="browse-items-primary"]/li[contains(@class, "feed-item-container")]))
continuation = response.body.match(/"continuation":"(?<continuation>[^"]+)"/).try &.["continuation"]?
initial_data = extract_initial_data(response.body)
end
if auto_generated
items = extract_shelf_items(nodeset, ucid, author)
else
items = extract_items_html(nodeset, ucid, author)
end
return [] of SearchItem, nil if !initial_data
items = extract_items(initial_data)
continuation = extract_channel_playlists_cursor(continuation, auto_generated) if continuation
return items, continuation
end
@ -530,9 +485,8 @@ def produce_channel_playlists_url(ucid, cursor, sort = "newest", auto_generated
return "/browse_ajax?continuation=#{continuation}&gl=US&hl=en"
end
def extract_channel_playlists_cursor(url, auto_generated)
cursor = URI.parse(url).query_params
.try { |i| URI.decode_www_form(i["continuation"]) }
def extract_channel_playlists_cursor(cursor, auto_generated)
cursor = URI.decode_www_form(cursor)
.try { |i| Base64.decode(i) }
.try { |i| IO::Memory.new(i) }
.try { |i| Protodec::Any.parse(i) }
@ -949,25 +903,19 @@ def get_60_videos(ucid, author, page, auto_generated, sort_by = "newest")
response = YT_POOL.client &.get(url)
initial_data = JSON.parse(response.body).as_a.find &.["response"]?
break if !initial_data
videos.concat extract_videos(initial_data.as_h)
videos.concat extract_videos(initial_data.as_h, author, ucid)
end
return videos.size, videos
end
def get_latest_videos(ucid)
videos = [] of SearchVideo
url = produce_channel_videos_url(ucid, 0)
response = YT_POOL.client &.get(url)
json = JSON.parse(response.body)
initial_data = JSON.parse(response.body).as_a.find &.["response"]?
return [] of SearchVideo if !initial_data
author = initial_data["response"]?.try &.["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s
items = extract_videos(initial_data.as_h, author, ucid)
if json["content_html"]? && !json["content_html"].as_s.empty?
document = XML.parse_html(json["content_html"].as_s)
nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
videos = extract_videos_html(nodeset, ucid)
end
return videos
return items
end

View File

@ -313,32 +313,30 @@ def html_to_content(description_html : String)
return description
end
def extract_videos(initial_data : Hash(String, JSON::Any))
extract_items(initial_data).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil)
extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
end
def extract_items(initial_data : Hash(String, JSON::Any))
def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil)
items = [] of SearchItem
initial_data.try { |t|
t["contents"]? || t["response"]?
}.try { |t|
t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a[0]?.try &.["tabRenderer"]["content"] ||
initial_data.try { |t| t["contents"]? || t["response"]? }
.try { |t| t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?.try &.["tabRenderer"]["content"] ||
t["twoColumnSearchResultsRenderer"]?.try &.["primaryContents"] ||
t["continuationContents"]?
}.try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? }
.try &.["contents"]
.as_a.each { |c|
c.try &.["itemSectionRenderer"]["contents"].as_a
.try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a || t }
t["continuationContents"]? }
.try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? }
.try &.["contents"].as_a
.each { |c| c.try &.["itemSectionRenderer"]["contents"].as_a
.try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a ||
t[0]?.try &.["gridRenderer"]?.try &.["items"].as_a || t }
.each { |item|
if i = item["videoRenderer"]?
video_id = i["videoId"].as_s
title = i["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || ""
author_info = i["ownerText"]?.try &.["runs"].as_a[0]?
author = author_info.try &.["text"].as_s || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || ""
author = author_info.try &.["text"].as_s || author_fallback || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || ""
published = i["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local
view_count = i["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64
@ -382,8 +380,8 @@ def extract_items(initial_data : Hash(String, JSON::Any))
premiere_timestamp: premiere_timestamp
)
elsif i = item["channelRenderer"]?
author = i["title"]["simpleText"]?.try &.as_s || ""
author_id = i["channelId"]?.try &.as_s || ""
author = i["title"]["simpleText"]?.try &.as_s || author_fallback || ""
author_id = i["channelId"]?.try &.as_s || author_id_fallback || ""
author_thumbnail = i["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try { |u| "https:#{u["url"]}" } || ""
subscriber_count = i["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0
@ -409,9 +407,9 @@ def extract_items(initial_data : Hash(String, JSON::Any))
video_count = i["videoCount"]?.try &.as_s.to_i || 0
playlist_thumbnail = i["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || ""
author_info = i["shortBylineText"]["runs"].as_a[0]?
author = author_info.try &.["text"].as_s || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || ""
author_info = i["shortBylineText"]?.try &.["runs"].as_a[0]?
author = author_info.try &.["text"].as_s || author_fallback || ""
author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || ""
videos = i["videos"]?.try &.as_a.map do |v|
v = v["childVideoRenderer"]
@ -444,297 +442,11 @@ def extract_items(initial_data : Hash(String, JSON::Any))
elsif i = item["horizontalCardListRenderer"]?
elsif i = item["searchPyvRenderer"]? # Ad
end
}
}
} }
items
end
def extract_videos_html(nodeset, ucid = nil, author_name = nil)
extract_items_html(nodeset, ucid, author_name).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
end
def extract_items_html(nodeset, ucid = nil, author_name = nil)
# TODO: Make this a 'CommonItem', so it makes more sense to be used here
items = [] of SearchItem
nodeset.each do |node|
anchor = node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
if !anchor
next
end
title = anchor.content.strip
id = anchor["href"]
if anchor["href"].starts_with? "https://www.googleadservices.com"
next
end
author_id = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).try &.["href"].split("/")[-1] || ucid || ""
author = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).try &.content.strip || author_name || ""
description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")])).try &.to_s || ""
tile = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-tile")]))
if !tile
next
end
case tile["class"]
when .includes? "yt-lockup-playlist"
plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"]
anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-meta")]/a))
if !anchor
anchor = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li/a))
end
video_count = node.xpath_node(%q(.//span[@class="formatted-video-count-label"]/b)) ||
node.xpath_node(%q(.//span[@class="formatted-video-count-label"]))
if video_count
video_count = video_count.content
if video_count == "50+"
author = "YouTube"
author_id = "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
end
video_count = video_count.gsub(/\D/, "").to_i?
end
video_count ||= 0
videos = [] of SearchPlaylistVideo
node.xpath_nodes(%q(.//*[contains(@class, "yt-lockup-playlist-items")]/li)).each do |video|
anchor = video.xpath_node(%q(.//a))
if anchor
video_title = anchor.content.strip
id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
end
video_title ||= ""
id ||= ""
anchor = video.xpath_node(%q(.//span/span))
if anchor
length_seconds = decode_length_seconds(anchor.content)
end
length_seconds ||= 0
videos << SearchPlaylistVideo.new(
video_title,
id,
length_seconds
)
end
playlist_thumbnail = node.xpath_node(%q(.//span/img)).try &.["data-thumb"]?
playlist_thumbnail ||= node.xpath_node(%q(.//span/img)).try &.["src"]
items << SearchPlaylist.new(
title: title,
id: plid,
author: author,
ucid: author_id,
video_count: video_count,
videos: videos,
thumbnail: playlist_thumbnail
)
when .includes? "yt-lockup-channel"
author = title.strip
ucid = node.xpath_node(%q(.//button[contains(@class, "yt-uix-subscription-button")])).try &.["data-channel-external-id"]?
ucid ||= id.split("/")[-1]
author_thumbnail = node.xpath_node(%q(.//div/span/img)).try &.["data-thumb"]?
author_thumbnail ||= node.xpath_node(%q(.//div/span/img)).try &.["src"]
if author_thumbnail
author_thumbnail = URI.parse(author_thumbnail)
author_thumbnail.scheme = "https"
author_thumbnail = author_thumbnail.to_s
end
author_thumbnail ||= ""
subscriber_count = node.xpath_node(%q(.//span[contains(@class, "subscriber-count")]))
.try &.["title"].try { |text| short_text_to_number(text) } || 0
video_count = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li)).try &.content.split(" ")[0].gsub(/\D/, "").to_i?
items << SearchChannel.new(
author: author,
ucid: ucid,
author_thumbnail: author_thumbnail,
subscriber_count: subscriber_count,
video_count: video_count || 0,
description_html: description_html,
auto_generated: video_count ? false : true,
)
else
id = id.lchop("/watch?v=")
metadata = node.xpath_node(%q(.//div[contains(@class,"yt-lockup-meta")]/ul))
published = metadata.try &.xpath_node(%q(.//li[contains(text(), " ago")])).try { |node| decode_date(node.content.sub(/^[a-zA-Z]+ /, "")) }
published ||= metadata.try &.xpath_node(%q(.//span[@data-timestamp])).try { |node| Time.unix(node["data-timestamp"].to_i64) }
published ||= Time.utc
view_count = metadata.try &.xpath_node(%q(.//li[contains(text(), " views")])).try &.content.gsub(/\D/, "").to_i64?
view_count ||= 0_i64
length_seconds = node.xpath_node(%q(.//span[@class="video-time"])).try { |node| decode_length_seconds(node.content) }
length_seconds ||= -1
live_now = node.xpath_node(%q(.//span[contains(@class, "yt-badge-live")])) ? true : false
premium = node.xpath_node(%q(.//span[text()="Premium"])) ? true : false
if !premium || node.xpath_node(%q(.//span[contains(text(), "Free episode")]))
paid = false
else
paid = true
end
premiere_timestamp = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li/span[@class="localized-date"])).try &.["data-timestamp"]?.try &.to_i64?
if premiere_timestamp
premiere_timestamp = Time.unix(premiere_timestamp)
end
items << SearchVideo.new(
title: title,
id: id,
author: author,
ucid: author_id,
published: published,
views: view_count,
description_html: description_html,
length_seconds: length_seconds,
live_now: live_now,
paid: paid,
premium: premium,
premiere_timestamp: premiere_timestamp
)
end
end
return items
end
def extract_shelf_items(nodeset, ucid = nil, author_name = nil)
items = [] of SearchPlaylist
nodeset.each do |shelf|
shelf_anchor = shelf.xpath_node(%q(.//h2[contains(@class, "branded-page-module-title")]))
next if !shelf_anchor
title = shelf_anchor.xpath_node(%q(.//span[contains(@class, "branded-page-module-title-text")])).try &.content.strip
title ||= ""
id = shelf_anchor.xpath_node(%q(.//a)).try &.["href"]
next if !id
shelf_is_playlist = false
videos = [] of SearchPlaylistVideo
shelf.xpath_nodes(%q(.//ul[contains(@class, "yt-uix-shelfslider-list") or contains(@class, "expanded-shelf-content-list")]/li)).each do |child_node|
type = child_node.xpath_node(%q(./div))
next if !type
case type["class"]
when .includes? "yt-lockup-video"
shelf_is_playlist = true
anchor = child_node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
if anchor
video_title = anchor.content.strip
video_id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
end
video_title ||= ""
video_id ||= ""
anchor = child_node.xpath_node(%q(.//span[@class="video-time"]))
if anchor
length_seconds = decode_length_seconds(anchor.content)
end
length_seconds ||= 0
videos << SearchPlaylistVideo.new(
video_title,
video_id,
length_seconds
)
when .includes? "yt-lockup-playlist"
anchor = child_node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
if anchor
playlist_title = anchor.content.strip
params = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)
plid = params["list"]
end
playlist_title ||= ""
plid ||= ""
playlist_thumbnail = child_node.xpath_node(%q(.//span/img)).try &.["data-thumb"]?
playlist_thumbnail ||= child_node.xpath_node(%q(.//span/img)).try &.["src"]
video_count = child_node.xpath_node(%q(.//span[@class="formatted-video-count-label"]/b)) ||
child_node.xpath_node(%q(.//span[@class="formatted-video-count-label"]))
if video_count
video_count = video_count.content.gsub(/\D/, "").to_i?
end
video_count ||= 50
videos = [] of SearchPlaylistVideo
child_node.xpath_nodes(%q(.//*[contains(@class, "yt-lockup-playlist-items")]/li)).each do |video|
anchor = video.xpath_node(%q(.//a))
if anchor
video_title = anchor.content.strip
id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
end
video_title ||= ""
id ||= ""
anchor = video.xpath_node(%q(.//span/span))
if anchor
length_seconds = decode_length_seconds(anchor.content)
end
length_seconds ||= 0
videos << SearchPlaylistVideo.new(
video_title,
id,
length_seconds
)
end
items << SearchPlaylist.new(
title: playlist_title,
id: plid,
author: author_name,
ucid: ucid,
video_count: video_count,
videos: videos,
thumbnail: playlist_thumbnail
)
else
next # Skip
end
end
if shelf_is_playlist
plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"]
items << SearchPlaylist.new(
title: title,
id: plid,
author: author_name,
ucid: ucid,
video_count: videos.size,
videos: videos,
thumbnail: "https://i.ytimg.com/vi/#{videos[0].id}/mqdefault.jpg"
)
end
end
return items
end
def check_enum(db, logger, enum_name, struct_type = nil)
return # TODO
if !db.query_one?("SELECT true FROM pg_type WHERE typname = $1", enum_name, as: Bool)

View File

@ -243,7 +243,8 @@ def channel_search(query, page, channel)
response = YT_POOL.client &.get(url)
initial_data = JSON.parse(response.body).as_a.find &.["response"]?
return 0, [] of SearchItem if !initial_data
items = extract_items(initial_data.as_h)
author = initial_data["response"]?.try &.["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s
items = extract_items(initial_data.as_h, author, ucid)
return items.size, items
end

View File

@ -286,7 +286,7 @@ end
# headers = HTTP::Headers.new
# headers["Cookie"] = env_headers["Cookie"]
#
# html = YT_POOL.client &.get("/view_all_playlists?disable_polymer=1", headers)
# html = YT_POOL.client &.get("/view_all_playlists", headers)
#
# cookies = HTTP::Cookies.from_headers(headers)
# html.cookies.each do |cookie|