FredAnderson
Hi, I cleaned up the script a bit (well, quite a lot); it now returns all the links correctly.
To run it you need Python 3.11 or newer (for tomllib) and the dependencies installed with pip:
pip install -U beautifulsoup4 requests
You also need to create a file called config.toml in the current directory, like:
frm_login="mylogin"
frm_password="mypassword"
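If you want to make sure the config parses before running the whole thing, here is a minimal check (just a sketch; it assumes config.toml sits in the current directory, and remember tomllib only ships with Python 3.11+):

import tomllib

with open("config.toml", "rb") as f:
    cfg = tomllib.load(f)

# the two keys double as the form field names POSTed to the login page
assert set(cfg) == {"frm_login", "frm_password"}
print("config OK for user:", cfg["frm_login"])

And here is the full script: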
#!/usr/bin/env python3
from urllib.parse import urljoin
import logging
import os
import sys
import tomllib

import requests
from bs4 import BeautifulSoup

# configure logging
logger = logging.getLogger(os.path.basename(__file__))
logging.basicConfig(level=logging.INFO)
# global variables
CONFIG_FILE = os.path.join(os.getcwd(), "config.toml")
DOMAIN_URL = "https://www.karaoke-version.com"
LOGIN_URL = f"{DOMAIN_URL}/my/login.html"
TARGET_PREFIX = "/custombackingtrack/"  # prefix of the song pages (currently unused)

def get_download_page_url(base_url, page):
    download_url = f"{base_url}/my/download.html?page={page:d}"
    return download_url


def get_number_of_pages(session, domain_url):
    # Trick: asking for an absurdly high page number makes the site serve
    # the last existing page, so we can read the real page count from the
    # pagination links there.
    logger.debug("Opening download page %d", 999)
    url = get_download_page_url(domain_url, 999)
    response = session.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    # Get all pagination links (they carry the class 'mr-1')
    pagination_elements = soup.select("a.mr-1")
    last_page_number = 1
    # Check if there are pagination elements
    if pagination_elements:
        # Get the last element's href attribute
        last_page_link = pagination_elements[-1]["href"]
        # Extract the page number from the href
        last_page_number = int(last_page_link.split("page=")[-1])
    return last_page_number


def get_song_urls(session, domain_url, total_pages):
    all_song_urls = set()  # a set automatically handles duplicates
    for page_number in range(1, total_pages + 1):
        url = get_download_page_url(domain_url, page_number)
        response = session.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        # Find all <a> elements within <td class="my-downloaded-files__song">
        song_elements = soup.select("td.my-downloaded-files__song a")
        # Turn the href attributes into absolute URLs
        song_urls = [urljoin(domain_url, element["href"]) for element in song_elements]
        all_song_urls.update(song_urls)  # Update the set with new URLs
    # Sort the collected URLs
    return sorted(all_song_urls)


def run_main():
    logger.info("Load config from %s", CONFIG_FILE)
    with open(CONFIG_FILE, "rb") as f:
        my_config = tomllib.load(f)
    with requests.Session() as session:
        # Log in; the config keys double as the login form field names
        logger.info("Open login page %s", LOGIN_URL)
        response = session.post(LOGIN_URL, data=my_config)
        # Fail early on an HTTP error (a 200 alone does not prove the
        # credentials were accepted, but it catches the obvious failures)
        response.raise_for_status()
        logger.info("Login successful. Getting number of pages")
        total_pages = get_number_of_pages(session, DOMAIN_URL)
        logger.info("Total number of pages is %d", total_pages)
        sorted_urls = get_song_urls(session, DOMAIN_URL, total_pages)
        logger.info("Sorted song URLs:")
        for url in sorted_urls:
            print(url)


if __name__ == "__main__":
    logger.debug("run main")
    sys.exit(run_main())
Now it returns an ordered list of all the correct links.
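By the way, if you want to test the page-count parsing without logging in, here is a self-contained sketch; the HTML snippet is made up, but it mirrors the a.mr-1 pagination structure the script selects on:

from bs4 import BeautifulSoup

# hypothetical pagination markup shaped like the site's download page
html = """
<a class="mr-1" href="/my/download.html?page=1">1</a>
<a class="mr-1" href="/my/download.html?page=2">2</a>
<a class="mr-1" href="/my/download.html?page=7">7</a>
"""

soup = BeautifulSoup(html, "html.parser")
links = soup.select("a.mr-1")
# same logic as get_number_of_pages: take the last link's page= value
last_page = int(links[-1]["href"].split("page=")[-1]) if links else 1
print(last_page)  # prints 7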