import requests
import json
import configparser
from urllib.parse import quote
import time
import warnings
warnings.filterwarnings('ignore')

# --- Spotify Authorization Code flow (interactive, one-off) ------------------
# Reads credentials from config.ini, prints the consent URL, asks the user to
# paste back the URL they were redirected to, and exchanges the returned code
# for an access/refresh token pair.
from urllib.parse import parse_qs, urlparse

config = configparser.ConfigParser()
config.read('config.ini')

USERNAME = config.get('SPOTIFY', 'username')
PASSWORD = config.get('SPOTIFY', 'password')
CLIENT_ID = config.get('SPOTIFY', 'client_id')
CLIENT_SECRET = config.get('SPOTIFY', 'client_secret')
REDIRECT_URI = config.get("SPOTIFY", 'redirect_uri')
SCOPE = config.get('SPOTIFY', 'scope')

SPOTIFY_AUTH_URL = "https://accounts.spotify.com/authorize"
SPOTIFY_TOKEN_URL = "https://accounts.spotify.com/api/token"
SPOTIFY_API_BASE_URL = "https://api.spotify.com"
API_VERSION = "v1"
SPOTIFY_API_URL = "{}/{}".format(SPOTIFY_API_BASE_URL, API_VERSION)

auth_query_parameters = {
    "response_type": "code",
    "redirect_uri": REDIRECT_URI,
    "scope": SCOPE,
    "client_id": CLIENT_ID,
}

url_args = "&".join("{}={}".format(key, quote(val)) for key, val in auth_query_parameters.items())
auth_url = "{}/?{}".format(SPOTIFY_AUTH_URL, url_args)

print(f"Authorization URL: {auth_url}")

input_url = input("Please paste the URL you were redirected to after authorizing the app: ")

# Parse the redirect properly: parse_qs percent-decodes the code (string
# splitting left it encoded, so it would be encoded a second time on POST) and
# lets us surface an `error=` redirect instead of sending garbage onward.
redirect_params = parse_qs(urlparse(input_url.strip()).query)
if "error" in redirect_params:
    raise RuntimeError(f"Spotify authorization failed: {redirect_params['error'][0]}")
if "code" in redirect_params:
    authorization_code = redirect_params["code"][0]
else:
    # Fall back to the lenient split so pasting a bare code still works.
    authorization_code = input_url.strip().split("code=")[-1].split("&")[0]
print(f"Authorization code: {authorization_code}")


access_token_request_url = SPOTIFY_TOKEN_URL
access_token_request_data = {
    "grant_type": "authorization_code",
    "code": authorization_code,
    "redirect_uri": REDIRECT_URI,
    "client_id": CLIENT_ID,
    "client_secret": CLIENT_SECRET,
}

print(f"Access token request URL: {access_token_request_url}")
# Never echo the client secret into notebook output or logs.
redacted_request_data = {**access_token_request_data, "client_secret": "***"}
print(f"Access token request data: {redacted_request_data}")

response = requests.post(access_token_request_url, data=access_token_request_data, timeout=30)
if not response.ok:
    # Fail here with Spotify's own message rather than with a KeyError below.
    raise RuntimeError(f"Token request failed ({response.status_code}): {response.text}")

response_data = response.json()

print(f"Response data: {response_data}")

access_token = response_data["access_token"]
refresh_token = response_data["refresh_token"]
token_type = response_data["token_type"]
expires_in = response_data["expires_in"]

print(f"Access token: {access_token}")

Authorization URL: https://accounts.spotify.com/authorize/?response_type=code&redirect_uri=http%3A//rawcsav.com&scope=user-read-private%20user-read-email%20playlist-modify-private%20ugc-image-upload%20user-library-modify&client_id=a7736df74b5e4fd19b696881709023ba

import pandas as pd
from tqdm import tqdm
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from pprint import pprint
import requests
import concurrent.futures
import json
import configparser
from urllib.parse import quote
import time

# Load Spotify credentials from config.ini and build an app-only
# (client-credentials) spotipy client for the catalogue lookups below.
config = configparser.ConfigParser()
config.read('config.ini')

USERNAME = config.get('SPOTIFY', 'username')
PASSWORD = config.get('SPOTIFY', 'password')
CLIENT_ID = config.get('SPOTIFY', 'client_id')
CLIENT_SECRET = config.get('SPOTIFY', 'client_secret')
REDIRECT_URI = config.get('SPOTIFY', 'redirect_uri')
SCOPE = config.get('SPOTIFY', 'scope')
SPOTIFY_API_KEY = config.get('SPOTIFY', 'spotify_api_key')

SPOTIFY_AUTH_URL = "https://accounts.spotify.com/authorize"
SPOTIFY_TOKEN_URL = "https://accounts.spotify.com/api/token"
SPOTIFY_API_BASE_URL = "https://api.spotify.com"
API_VERSION = "v1"
SPOTIFY_API_URL = f"{SPOTIFY_API_BASE_URL}/{API_VERSION}"

client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Playlist under analysis (URI form and bare id).
uri = 'spotify:playlist:662StmFnG83CfpdygiWmeJ'
playlist_id = '662StmFnG83CfpdygiWmeJ'

import pandas as pd

# Walk every page of the playlist, accumulating the raw items.
results = sp.playlist_tracks(playlist_id)
tracks = results['items']
while results['next']:
    results = sp.next(results)
    tracks += results['items']

# 'genres' is declared here but not filled per row, so it starts out empty.
columns = ['title', 'release_date', 'artists', 'album', 'popularity', 'genres', 'duration', 'explicit', 'track_id', 'artist_ids']
rows = []

for track in tqdm(tracks):
    # Local files and removed tracks carry no usable catalogue metadata.
    if track['is_local'] or track['track'] is None or track['track']['is_local']:
        continue
    info = track['track']
    performers = info['artists']
    rows.append({
        'title': info['name'],
        'release_date': info['album']['release_date'],
        'artists': ', '.join(performer['name'] for performer in performers),
        'album': info['album']['name'],
        'popularity': info['popularity'],
        'duration': info['duration_ms'],
        'explicit': info['explicit'],
        'track_id': info['id'],
        'artist_ids': [performer['id'] for performer in performers],
    })

df = pd.DataFrame(rows, columns=columns)
df.info()

100%|███████████████████████████████████| 1949/1949 [00:00<00:00, 209634.53it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   title         1756 non-null   object
 1   release_date  1756 non-null   object
 2   artists       1756 non-null   object
 3   album         1756 non-null   object
 4   popularity    1756 non-null   int64
 5   genres        0 non-null      float64
 6   duration      1756 non-null   int64
 7   explicit      1756 non-null   bool
 8   track_id      1756 non-null   object
 9   artist_ids    1756 non-null   object
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 125.3+ KB

# Fetch Spotify audio features for every track (50 ids per request) and join
# them onto the track table by track_id.
ids = df['track_id'].tolist()
audio_features = []
for index in range(0, len(ids), 50):
    audio_features += sp.audio_features(ids[index:index + 50])

feature_names = ['energy', 'liveness',
                 'tempo', 'speechiness',
                 'acousticness', 'instrumentalness',
                 'time_signature', 'danceability',
                 'key', 'loudness',
                 'valence', 'mode']

# The API answers with None for tracks it has no analysis for; indexing such
# an entry would crash, so those tracks are skipped here (the inner merge below
# drops them from df as well).
features_list = [
    [track_id] + [features[name] for name in feature_names]
    for track_id, features in zip(ids, audio_features)
    if features is not None
]

features_df = pd.DataFrame(features_list, columns=['track_id'] + feature_names)
# A track that appears twice in the playlist would otherwise match itself
# 2 x 2 times in the merge and inflate the table.
features_df = features_df.drop_duplicates(subset='track_id')
df = df.merge(features_df, on='track_id')
df.sample(5)

from tqdm import tqdm
import pandas as pd


genres = []
BATCH_SIZE = 50

# Resolve every distinct artist once instead of issuing one request per track
# row; the per-row approach repeats the same lookups for every shared artist.
unique_artist_ids = list(dict.fromkeys(
    artist_id for artist_id_list in df['artist_ids'] for artist_id in artist_id_list
))

genres_by_artist = {}
for i in tqdm(range(0, len(unique_artist_ids), BATCH_SIZE), desc="Processing artist IDs"):
    batch_artist_ids = unique_artist_ids[i:i + BATCH_SIZE]
    artist_results = sp.artists(batch_artist_ids)['artists']
    # Key by the id we asked for: results come back in request order, and an
    # unknown artist may be returned as None.
    for artist_id, artist in zip(batch_artist_ids, artist_results):
        genres_by_artist[artist_id] = artist['genres'] if artist else []

# Same output shape as before: one comma-joined genre string per track, with
# genres listed in the order of that track's artists.
for artist_id_list in df['artist_ids']:
    artist_genres = []
    for artist_id in artist_id_list:
        artist_genres.extend(genres_by_artist.get(artist_id, []))
    genres.append(', '.join(artist_genres))

df['genres'] = genres
df.sample(5)

Processing artist IDs: 100%|████████████████| 1756/1756 [05:24<00:00,  5.40it/s]

import re
import csv
import pandas as pd
from lyricsgenius import Genius
import configparser
import musixmatch
from fuzzywuzzy import fuzz
from tqdm import tqdm
import requests
import concurrent.futures


# Lyrics providers: API keys come from the [LYRICS] section of config.ini.
config = configparser.ConfigParser()
config.read('config.ini')
GENIUS_API_KEY = config.get("LYRICS", "GENIUS_API_KEY")
MUSIX_API_KEY = config.get("LYRICS", "MUSIX_API_KEY")
# Shared Genius client (quiet, 5 retries, 10 s timeout per request).
genius = Genius(GENIUS_API_KEY, verbose=False, retries=5, timeout=10)

# In-memory cache keyed by (title, artist); values are lyric text or the
# "lyrics not found" sentinel written by get_lyrics.
lyrics_dict = {}

def remove_parentheses(text):
    """Return *text* with every ``(...)`` group deleted and the ends trimmed.

    Anything that is not a string yields ``''``. Whitespace left behind in the
    middle of the string is not collapsed.
    """
    if isinstance(text, str):
        without_groups = re.sub(r'\([^)]*\)', '', text)
        return without_groups.strip()
    return ''

def get_lyrics(row):
    """Look up lyrics for one track row, trying Genius first, then Musixmatch.

    Args:
        row: mapping with a ``"title"`` and a comma-separated ``"artists"``
            string (a non-string ``artists`` value is treated as no artists).

    Returns:
        The lyric text, or the sentinel ``"lyrics not found"``. Results
        (including misses) are memoised in the module-level ``lyrics_dict``
        under ``(title, artist)``.
    """
    title = remove_parentheses(row["title"])
    raw_artists = row["artists"]
    all_artists = [name.strip() for name in raw_artists.split(",")] if isinstance(raw_artists, str) else []

    # Serve from the cache before touching the network.
    for artist in all_artists:
        if (title, artist) in lyrics_dict:
            return lyrics_dict[(title, artist)]

    # Provider 1: Genius, accepted only when title and artist fuzzy-match.
    for artist in all_artists:
        song = genius.search_song(title, artist)
        if song is None or len(song.lyrics) < 20:
            continue

        title_ratio = fuzz.token_set_ratio(title, song.title)
        genius_artists = [genius_artist.strip() for genius_artist in song.artist.split(",")]
        artist_matched = any(
            fuzz.token_set_ratio(artist, genius_artist) >= 70
            for genius_artist in genius_artists
        )

        if title_ratio >= 70 and artist_matched:
            lyrics_dict[(title, artist)] = song.lyrics
            return song.lyrics

    # Provider 2: Musixmatch. Passing `params` lets requests escape titles
    # containing '&', '#', '?' and similar, which would otherwise corrupt the
    # query string.
    for artist in all_artists:
        try:
            response = requests.get(
                "https://api.musixmatch.com/ws/1.1/matcher.lyrics.get",
                params={"q_track": title, "q_artist": artist, "apikey": MUSIX_API_KEY},
                timeout=10,
            )
            data = response.json()
        except (requests.RequestException, ValueError):
            # Network failure or non-JSON body: try the next artist rather
            # than aborting the whole batch.
            continue

        if data["message"]["header"]["status_code"] == 200:
            lyrics = data["message"]["body"]["lyrics"]["lyrics_body"]
            if len(lyrics) > 0:
                lyrics_dict[(title, artist)] = lyrics
                return lyrics

    print(f"No lyrics found for {title} by {', '.join(all_artists)}")
    # Record the miss for every artist. The loop variable must not be reused
    # here: it is unbound when the artist list is empty.
    for artist in all_artists:
        lyrics_dict[(title, artist)] = "lyrics not found"
    return "lyrics not found"

def process_batch(batch):
    """Return a copy of *batch* with a ``lyrics`` column filled by get_lyrics.

    Working on a copy avoids writing into a slice of the caller's DataFrame,
    which pandas reports as chained assignment and may or may not propagate.
    """
    batch = batch.copy()
    batch["lyrics"] = batch.apply(get_lyrics, axis=1)
    return batch

def sanitize_text(text):
    """Reduce a title or artist name to a bare search string.

    Strips " - ..." suffixes, " Remastered..." tails, bracketed and
    parenthesised groups, and a trailing featuring credit, then drops every
    character that is not an ASCII letter, digit, space or newline.
    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    text = re.sub(r" - .*", "", text)
    text = re.sub(r" Remastered.*", "", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\(.*?\)", "", text)
    # Require a word boundary AFTER the marker too; without it, words that
    # merely start with "feat"/"ft" (e.g. "Feather", "Feature") were erased.
    text = re.sub(r"(?i)\b(?:featuring|feat|ft)\b\.?.*", "", text)
    text = re.sub(r'[^a-zA-Z0-9 \n]', '', text)

    return text.strip()

def check_song_match(song, title, artists):
    """Decide whether a Genius result plausibly is the requested track.

    Args:
        song: Genius song object (needs ``lyrics``, ``title``, ``artist``) or None.
        title: title that was searched for.
        artists: list of artist names that were searched for.

    Returns:
        True when the song has at least 20 characters of lyrics, the title
        fuzzy-matches (>= 70) and either one artist pair or the joined artist
        strings fuzzy-match (>= 70); otherwise False.
    """
    # Imported locally so the function does not depend on a later cell
    # having imported `product` first.
    from itertools import product

    if song is None or len(song.lyrics) < 20:
        return False

    title_ratio = fuzz.token_set_ratio(title, song.title)
    genius_artists = [genius_artist.strip() for genius_artist in song.artist.split(",")]

    # default=0: an empty artist list must mean "no match", not a ValueError.
    max_individual_artist_ratio = max(
        (
            fuzz.token_set_ratio(artist, genius_artist)
            for artist, genius_artist in product(artists, genius_artists)
        ),
        default=0,
    )
    combined_artist_ratio = fuzz.token_set_ratio(", ".join(artists), ", ".join(genius_artists))

    return title_ratio >= 70 and (max_individual_artist_ratio >= 70 or combined_artist_ratio >= 70)

def search_song(title, artists):
    """Search Genius for *title*, first by all artists jointly, then one by one.

    The joint result is returned straight away when it matches or when there is
    only one artist. Otherwise each artist is tried in turn and the first
    matching result wins; if none matches, the last result searched is
    returned, so callers still need to validate it.
    """
    candidate = genius.search_song(title, ", ".join(artists))

    if check_song_match(candidate, title, artists) or len(artists) <= 1:
        return candidate

    for name in artists:
        candidate = genius.search_song(title, name)
        if check_song_match(candidate, title, [name]):
            break

    return candidate

def get_lyrics_for_row(row):
    """Return usable lyrics for *row*, re-querying Genius when they look bad.

    Lyrics count as bad when they are null, equal the "lyrics not found"
    sentinel, have fewer than 30 words, or consist only of digits. In that
    case a validated Genius result is returned, or the sentinel again.
    """
    title, artists, lyrics = row['title'], row['artists'].split(', '), row['lyrics']

    needs_lookup = (
        pd.isnull(lyrics)
        or lyrics == "lyrics not found"
        or len(lyrics.split()) < 30
        or lyrics.isdigit()
    )
    if not needs_lookup:
        return lyrics

    song = search_song(title, artists)
    return song.lyrics if check_song_match(song, title, artists) else "lyrics not found"

def get_lyrics_batch(df):
    """Apply get_lyrics_for_row to every row of *df* and return the result."""
    return df.apply(get_lyrics_for_row, axis=1)

def get_lyrics_batch_with_sanitization(df):
    """Row-wise lyric retry that searches with sanitised titles and artists.

    Applies the same "bad lyrics" test as get_lyrics_for_row, but passes the
    title and each artist through sanitize_text before searching Genius.
    """
    def get_lyrics_for_row_with_sanitization(row):
        clean_title = sanitize_text(row['title'])
        clean_artists = [sanitize_text(name) for name in row['artists'].split(', ')]
        current = row['lyrics']

        needs_lookup = (
            pd.isnull(current)
            or current == "lyrics not found"
            or len(current.split()) < 30
            or current.isdigit()
        )
        if not needs_lookup:
            return current

        song = search_song(clean_title, clean_artists)
        if check_song_match(song, clean_title, clean_artists):
            return song.lyrics
        return "lyrics not found"

    return df.apply(get_lyrics_for_row_with_sanitization, axis=1)

import re
import pandas as pd
from itertools import product
from fuzzywuzzy import fuzz
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# First pass: split the table into 20-row slices and fetch lyrics for the
# slices concurrently, then stitch the processed slices back together.
batch_size = 20
batches = [df[start:start + batch_size] for start in range(0, len(df), batch_size)]

with concurrent.futures.ThreadPoolExecutor() as executor:
    processed = executor.map(process_batch, batches)
    results = list(tqdm(processed, total=len(batches)))

df = pd.concat(results)

# Second pass: re-query Genius for rows whose lyrics are missing or too short.
batch_size = 20
# Positional, non-overlapping chunks. Label-based df.loc[start:end] is
# end-inclusive (each boundary row was processed twice) and the trailing
# chunk could be empty.
row_batches = [df.iloc[start:start + batch_size] for start in range(0, len(df), batch_size)]
n_batches = len(row_batches)

# Hand all chunks to the pool at once; calling .result() right after each
# submit() made the pool run strictly one chunk at a time.
with ThreadPoolExecutor() as executor:
    refreshed = list(tqdm(executor.map(get_lyrics_batch, row_batches), total=n_batches))

# Write back only after every worker has finished, so df is never mutated
# while other threads are still reading from it.
if refreshed:
    updated_lyrics = pd.concat(refreshed)
    df.loc[updated_lyrics.index, 'lyrics'] = updated_lyrics

# Third pass: retry rows that still have no usable lyrics, this time searching
# with sanitised titles and artist names.
batch_size = 20

# Compute the "still missing" mask once for the whole table. NaN lyrics give
# NaN from the .str accessors; .lt()/.eq() turn those into False.
current_lyrics = df['lyrics']
still_missing = (
    current_lyrics.isna()
    | current_lyrics.str.len().lt(20)
    | current_lyrics.str.isdigit().eq(True)
)
missing_lyrics_rows = df[still_missing]

retry_batches = [
    missing_lyrics_rows.iloc[start:start + batch_size]
    for start in range(0, len(missing_lyrics_rows), batch_size)
]
n_batches = len(retry_batches)

# Submit every chunk before waiting on any result so the pool actually runs
# the lookups concurrently.
with ThreadPoolExecutor() as executor:
    retried = list(tqdm(executor.map(get_lyrics_batch_with_sanitization, retry_batches), total=n_batches))

# Nothing to retry means nothing to assign (an empty apply() yields an empty
# DataFrame, which cannot be written into a single column).
if retried:
    new_lyrics = pd.concat(retried)
    df.loc[new_lyrics.index, 'lyrics'] = new_lyrics

  0%|                                                    | 0/88 [00:00<?, ?it/s]

No lyrics found for Echoes in My Mind by Spooky Black, Wiccaphase
No lyrics found for VENGEANCE | VENGEANCE [FEAT. JPEGMAFIA & ZILLAKAMI | JPEGMAF1A + Z1LLAKAM1] by Denzel Curry, JPEGMAFIA, ZillaKami
No lyrics found for Aahhyeahh by Oliver Francis
No lyrics found for Время Ток by Antoha MC
No lyrics found for Kill Yourself by $uicideboy$
No lyrics found for Soul Doubt by $uicideboy$
No lyrics found for You're Now Tuning Into 66.6 FM With DJ Rapture by $uicideboy$
No lyrics found for Praise The Lord by A$AP Rocky, Skepta
No lyrics found for If You Were to Get What You Deserve, You Would Know What the Bottom of a Tire Tastes Like by $uicideboy$
No lyrics found for Bags by $uicideboy$
No lyrics found for SIRENS | Z1RENZ [FEAT. J.I.D | J.1.D] by Denzel Curry, JID
No lyrics found for Benz Truck by Lil Peep
No lyrics found for Freewave Freestyle 4 by LUCKI

  1%|▍                                         | 1/88 [01:12<1:44:40, 72.19s/it]

No lyrics found for Big Glock Freestyle by Smgsoulja
No lyrics found for WISH FEAT. KIDDO MARV by Denzel Curry, Kiddo Marv
No lyrics found for Suicidal Thoughts - 2005 Remaster by The Notorious B.I.G.
No lyrics found for Way2geekd by Malik
No lyrics found for Pursuit Of Happiness by Kid Cudi, MGMT, Ratatat
No lyrics found for Tiktok by Spliflort

 15%|██████▎                                    | 13/88 [01:50<08:41,  6.96s/it]

No lyrics found for Bean  [feat. Chief Keef] by Lil Uzi Vert, Chief Keef

 18%|███████▊                                   | 16/88 [01:51<06:22,  5.31s/it]

No lyrics found for I <3 My Choppa by Tay-K

 22%|█████████▎                                 | 19/88 [01:56<04:43,  4.11s/it]

No lyrics found for Treehouse Alex G by Jacob Boatsman
No lyrics found for Teck Deck by Pear Lindsay
No lyrics found for Heaven Knows I'm Miserable Now - 2011 Remaster by The Smiths
No lyrics found for Bigmouth Strikes Again - 2017 Master by The Smiths
No lyrics found for Sleep Next to Me by A L E X
No lyrics found for Nowhere Fast - 2011 Remaster by The Smiths
No lyrics found for Pretty Girls Make Graves - 2011 Remaster by The Smiths
No lyrics found for Riot! by Earl Sweatshirt
No lyrics found for Lord You're Okay by A L E X

# Genres for which lyrics are not requested manually. should_skip_song tests
# each entry with `in` against the row's genres value, so for a string value
# these act as substrings (e.g. "house" also matches "deep house").
excluded_genres =["techno", "edm", "electronic dance music", "rave", "house",
        "electronica", "ambient", "gabba", "dubstep", "synthwave", "experimental techno"]

def should_skip_song(song, excluded_genres, min_lyrics_length=20):
    """Tell whether manual lyric entry should be skipped for *song*.

    A song is skipped when any excluded genre occurs in ``song['genres']`` or
    when it already has at least *min_lyrics_length* characters of lyrics.
    """
    for genre in excluded_genres:
        if genre in song['genres']:
            return True
    return len(song['lyrics']) >= min_lyrics_length

def generate_google_search_url(song_title, artist_name):
    """Build a Google search URL for "<title> <artist> lyrics"."""
    search_terms = "{} {} lyrics".format(song_title, artist_name)
    return "https://www.google.com/search?q=" + requests.utils.quote(search_terms)

# Manual fallback: for every song that still lacks lyrics (and is not in an
# excluded genre) print a search link and let the user paste the lyrics.
for idx, song in df.iterrows():
    if should_skip_song(song, excluded_genres):
        continue

    google_search_url = generate_google_search_url(song['title'], song['artists'])
    # Both prompts previously closed a quote after the artist that was never opened.
    print(f"Google search URL for '{song['title']}' by '{song['artists']}' lyrics: {google_search_url}")
    print(f"Input lyrics for '{song['title']}' by '{song['artists']}' (Press enter without entering any text to skip this song):")
    try:
        user_input = input().strip()
    except (KeyboardInterrupt, EOFError):
        # Stop cleanly and keep whatever has been entered so far.
        print("Manual lyrics entry stopped.")
        break

    if user_input.lower() != 'skip' and user_input != '':
        df.at[idx, 'lyrics'] = user_input

Google search URL for 'Soul Doubt' by $uicideboy$' lyrics: https://www.google.com/search?q=Soul%20Doubt%20%24uicideboy%24%20lyrics
Input lyrics for 'Soul Doubt' by $uicideboy$' (Press enter without entering any text to skip this song):
[Lil No Flash:] Sold out shows, but I don't believe in souls So I doubt these shows are going to leave me feeling whole A couple blunts I blow and now I'm idolized, a role model That'll get 'em to get me to model clothes I'd rather snort up roxy's 'til that dope bottle rattles Nope, I'm not about to tackle yet another problem I'll have to put this one at the bottom I feel like fucking Sodom and Gomorrah, but life is sweet Another black petal falling down right at my fucking feet Another flower rotten, a bouquet of efforts, sour scents are haunting my defeat Sold out funeral, no live nation fee Captivate 100, 000 people, still me and the reaper me 100, 000 want to meet me, I hope they'll let me be  [Lil Half Cut:] I'm able to paint a picture most people can't even see Basquiat mixed with Monet when that herron in me Self-critic that's speaking in cryptic, defying the laws of physics Let me be specific, sadistic, Mr. Pessimistic $lickity $loth, the motherfucker they call the Anti-Christ Used to dream of fans chanting, screaming that "$uicide" Now I get on Instagram and they're posting my personal life I promise it's not what it seems Climb up just to fall down a stream Drag me to the river The richer I get, the worse my liver gets No strippers on my zipper Still in my room, keep it dark as I can Remember scraping up change just for cigarettes Immortalized $uicide, no, they won't forget
Google search URL for 'Praise The Lord (Da Shine) (feat. Skepta)' by A$AP Rocky, Skepta' lyrics: https://www.google.com/search?q=Praise%20The%20Lord%20%28Da%20Shine%29%20%28feat.%20Skepta%29%20A%24AP%20Rocky%2C%20Skepta%20lyrics
Input lyrics for 'Praise The Lord (Da Shine) (feat. Skepta)' by A$AP Rocky, Skepta' (Press enter without entering any text to skip this song):

KeyboardInterrupt

import pandas as pd
# Reload the saved track/feature/lyrics table; blanks become empty strings.
df = pd.read_csv("features_df.csv").fillna("")

import os
import re
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import tiktoken
import string
import numpy as np
import openai
import pandas as pd
from tqdm import tqdm
import pickle
import concurrent.futures
import tenacity
import tiktoken
from tqdm.auto import tqdm
import configparser

# OpenAI credentials come from the [OPENAI] section of config.ini.
config = configparser.ConfigParser()
config.read('config.ini')

openai.api_key = config.get("OPENAI", "OPENAI_API_KEY")
# NOTE(review): neither model constant is referenced in the code visible here.
COMPLETIONS_MODEL = "gpt-4"
EMBEDDING_MODEL = "text-embedding-ada-002"


# NOTE(review): stemmer is not referenced in the code visible here.
stemmer = SnowballStemmer("english")

# English stop words (strings), consulted by preprocess_text.
stop_words = set(stopwords.words("english"))

# Tokenizer used by preprocess_text to encode, count and decode lyric text.
tokenizer = tiktoken.get_encoding("cl100k_base")

# NOTE(review): max_length is not referenced in the code visible here.
max_length = 1024
def delete_first_and_last_10_words(text):
    """Drop the first and last ten whitespace-separated words of *text*.

    The remaining words are re-joined with single spaces; inputs of twenty
    words or fewer come back as an empty string.
    """
    tokens = text.split()
    kept = tokens[10:-10]
    return ' '.join(kept)

def preprocess_text(text, max_tokens=3000):
    """Normalise one lyric string and cap its token length.

    Steps: coerce to str, drop the first/last ten words, flatten line breaks,
    lowercase, strip punctuation and non-ASCII characters, remove English stop
    words, then truncate to *max_tokens* tokenizer tokens.

    Args:
        text: raw lyrics (any object; non-strings are passed through ``str``).
        max_tokens: maximum number of tokens kept. The default matches the
            previous effective limit of 3000.

    Returns:
        ``(cleaned_text, token_count)``.
    """
    if not isinstance(text, str):
        text = str(text)

    text = delete_first_and_last_10_words(text)

    # One pattern for every line-break flavour (the old second pattern,
    # 'r\n', matched a literal letter "r" followed by a newline).
    text = re.sub(r'[\r\n]+', ' ', text)

    text = text.lower()

    text = text.translate(str.maketrans('', '', string.punctuation))

    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Stop words must be removed from WORDS: the tokenizer yields integer ids,
    # so filtering ids against a set of strings never removed anything.
    text = ' '.join(word for word in text.split() if word not in stop_words)

    tokens = tokenizer.encode(text)

    # Slicing is a no-op for short inputs, so no length check is needed.
    tokens = tokens[:max_tokens]

    truncated_text = tokenizer.decode(tokens)

    return truncated_text, len(tokens)


# Clean every lyric and record its token count alongside it.
cleaned_pairs = df["lyrics"].apply(preprocess_text)
df["lyrics"], df["tokens"] = zip(*cleaned_pairs)

# Blank out lyrics that are too thin to analyse: under 20 words, under 40
# characters, whitespace-only, or purely numeric.
unusable = df["lyrics"].map(
    lambda text: len(text.split()) < 20
    or len(text) < 40
    or not text.strip()
    or text.isdigit()
)
df.loc[unusable, "lyrics"] = ""

df = df.fillna("")
# Rows without a title cannot be identified later; drop them.
df = df.drop(df.index[df["title"] == ""])

import openai
import pandas as pd
import concurrent.futures
from tqdm import tqdm
import warnings
import aiohttp
import asyncio
from tqdm.asyncio import tqdm
import nest_asyncio
import time

nest_asyncio.apply()

warnings.filterwarnings("ignore", category=DeprecationWarning)

async def generate_response(lyrics):
    async with aiohttp.ClientSession() as session:
        response = await openai.ChatCompletion.acreate(
            model='gpt-3.5-turbo',
        messages=[
            {"role": "system", "content": "You are a highly perceptive and insightful AI designed to analyze song lyrics. Using a set of given song lyrics, please provide a list of 20-30 descriptive words or phrases that encapsulate the themes and emotions conveyed in the song. The more specific and unique the words and phrases, the better."},
            {"role": "user", "content": "Using these song lyrics : 'Yo, Pi'erre, you wanna come out here? You want water or? What do you want? Just water Just water? Okay, alright, I'll get it right now Thank you Okay Damn, Pi'erre, where'd you find this? Ayy, baby, give me migraine Baby, give me my space, pull up, yeah, to my place Show me like it's my estate, she want me 'cause my name Rubber on, so I'm safe, livin' life is so strange Wait, ayy, baby, give me migraine Baby, give me my space, pull up, yeah, to my place Show me like it's my estate, she want me 'cause my name Rubber on, so I'm safe, livin' life is so strange Yeah, I know I hurt you Yeah, my weed is purple Yeah, I'm goin' in like curfew Still in your - like turtle She forgot all about me, psych You love me, right? I go deep, right? Runnin' routes, wide receiver Your -, I need her Fu- that - in my sneakers She love my father's features Swiper no swipe, no Visa Dora the Explorer, the world, I seen it Rich -, find my cleaners Hollywood -, - I don't need it God got me, I guarantee it Roll my blunt, I see no evil Y'all - still be fightin' y'all demons Cold ass world, man, so anemic Lame ass - still shop at Neimans These pants I got on, ten G, yeah Warren Lotas, that's cap, you see it All these famous people wanna be us (the number one rated mixtape of all time) Wait, ayy, baby, give me migraine Baby, give me my space, pull up, yeah, to my place Show me like it's my estate, she want me 'cause my name Rubber on, so I'm safe, livin' life is so strange Wait, ayy, baby, give me migraine (yo, Pi'erre, you wanna come out here?) 
Baby, give me my space, pull up, yeah, to my place Show me like it's my estate, she want me 'cause my name Rubber on, so I'm safe, livin' life is so strange Not used to buyin' the whole thing Tag around the world like I paint Know your type like font names Fallin' back like time change Secure the game, yeah, crime pays Get dollar signs, I don't bang Got a million in my bank Family call me wantin' things she think I don't care no more But look how much her mind changed I just flew you out to me Baby, these are my wings Love my -, don't expect a thing Me and you want a wedding cake Uber Eats, I'm on my way Open up like your safe Wait, ayy, baby, give me migraine Baby, give me my space, pull up, yeah, to my place Show me like it's my estate, she want me 'cause my name Rubber on, so I'm safe, livin' life is so strange Wait, ayy, baby, give me migraine Baby, give me my space, pull up, yeah, to my place Show me like it's my estate, she want me 'cause my name Rubber on, so I'm safe, livin' life is so strange All alone at the VMA, shout-out boomin', free the gang All alone at the VMA, shout-out boomin', free the gang All alone at the VMA, shout-out boomin', free the gang All alone at the VMA, shout-out boomin', free the gang Yo Pi'erre, you wanna come out here? Congratulations, you've won two free tickets to Soss Island Bring you and a friend The sossiest adventure you'll ever go on' --- Please provide a detailed and encompassing list of 5-10 words to capture the emotional tone/thematic content/overall essence of these song lyrics.  Avoid generalizations and literalness, and be specific to the song's content."},
            {"role": "assistant", "content": "Hedonism, Bravado, Disorientation, Escapism, Ambivalence, Fame-seeking, Materialism, Insecurity, Resilience"},
            {"role": "user", "content": f"Great job so far, that was good, now using these song lyrics: '{lyrics}' --- Please provide a detailed and encompassing list of 5-10 words to capture the emotional tone/thematic content/overall essence of these song lyrics.  Avoid generalizations and literalness, and be specific to the song's content."}],
        temperature=1.0,
        top_p=0.7,
        max_tokens=50,
    )
    return response["choices"][0]["message"]["content"]


async def process_batch(batch):
    """Generate descriptors for each lyric in *batch*, one request at a time.

    Blank lyrics get ``None`` without calling the model. The result is a
    Series with one entry per lyric, in input order.
    """
    collected = []
    for text in batch["lyrics"]:
        descriptor = await generate_response(text) if text.strip() != "" else None
        collected.append(descriptor)
    return pd.Series(collected)

async def process_with_progress_bar(batch, progress):
    """Run process_batch on *batch*, tick *progress* once, pause briefly.

    The half-second sleep after each batch spaces out consecutive requests.
    """
    outcome = await process_batch(batch)
    progress.update(1)
    await asyncio.sleep(0.5)
    return outcome

async def main():
    """Generate descriptors for every batch and store them in df["descriptors"].

    Reads the module-level ``batches`` and writes to the module-level ``df``;
    the batches must cover df's rows in order.
    """
    with tqdm(total=len(batches)) as progress:
        # gather() returns results in SUBMISSION order. Numbering results from
        # as_completed() followed completion order instead, which attached
        # descriptors to the wrong rows. process_with_progress_bar already
        # ticks the bar, so no second update here.
        batch_results = await asyncio.gather(
            *(process_with_progress_bar(batch, progress) for batch in batches)
        )

    # Flatten the per-batch Series back into one value per row.
    df["descriptors"] = [
        descriptor for batch_result in batch_results for descriptor in batch_result
    ]

async def process_filtered_rows():
    """Retry descriptor generation for rows that have lyrics but no descriptors.

    Reads and updates the module-level ``df`` (batch size from the
    module-level ``batch_size``). Rows with blank lyrics end with
    ``descriptors`` set to None.
    """
    filtered_df = df.loc[(df['descriptors'].isnull()) & (df['lyrics'].str.strip() != "")]

    filtered_batches = [filtered_df.iloc[i:i + batch_size] for i in range(0, len(filtered_df), batch_size)]

    with tqdm(total=len(filtered_batches)) as progress:
        # gather() keeps results aligned with filtered_batches; numbering
        # results from as_completed() paired them with the wrong batch.
        batch_results = await asyncio.gather(
            *(process_with_progress_bar(batch, progress) for batch in filtered_batches)
        )

    # Write every row of every batch (previously only each batch's first row
    # received a value).
    for batch, descriptors in zip(filtered_batches, batch_results):
        df.loc[batch.index, 'descriptors'] = descriptors.values

    df.loc[df['lyrics'].str.strip() == "", 'descriptors'] = None

# Top-level await: this only runs in an environment that allows it at module
# level (e.g. a notebook cell).
# Generate descriptors for every batch.
await main()

# Retry rows that have lyrics but still no descriptors.
await process_filtered_rows()

df.head()

3500it [00:09, 370.34it/s]
210it [00:05, 40.80it/s]

# Reload the table with descriptors and build a title-indexed frame of the
# audio features for the statistical analysis below.
df = pd.read_csv("descriptors.csv").fillna("")
numeric_features = ["title", "popularity", "energy", "liveness", "tempo", "speechiness", "acousticness", "instrumentalness", "time_signature", "danceability", "key", "loudness", "valence", "mode"]
df_f = df[numeric_features].set_index("title")
df_f.describe()

# NOTE(review): StandardScaler, plt and sns are not imported anywhere in the
# code visible here — presumably a setup cell is missing; confirm.
# Standardise every feature column.
scaler = StandardScaler()
test_scaled = scaler.fit_transform(df_f)

# Rebuilt from the scaled array with column names only; the title index of
# df_f is not passed on.
test_df = pd.DataFrame(test_scaled, columns = df_f.columns)

# Pairwise correlation heatmap of the scaled features.
corr_matrix = test_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans



# Feature-importance probe: cluster the scaled features with KMeans, then train
# a random forest to predict the cluster label and read off its importances.
features = ['popularity', 'energy', 'liveness', 'tempo', 'speechiness',
            'acousticness', 'instrumentalness', 'time_signature', 'danceability',
            'key', 'loudness', 'valence', 'mode']

kmeanst = KMeans(n_clusters=15, random_state=42)

kmeanst.fit(test_df)

clusterstest = kmeanst.labels_


test_df['cluster_labels'] = clusterstest

X = test_df.drop('cluster_labels', axis=1)
y = test_df['cluster_labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)

clf.fit(X_train, y_train)

# Label importances with the columns the model was actually fitted on; zipping
# against the hand-kept `features` list silently mislabels or truncates if the
# two ever drift apart.
importances = pd.DataFrame({'Feature': list(X.columns), 'Importance': clf.feature_importances_})
print("\nFeature Importances:\n", importances)

Feature Importances:
              Feature  Importance
0         popularity    0.111938
1             energy    0.077952
2           liveness    0.101039
3              tempo    0.126994
4        speechiness    0.074720
5       acousticness    0.084543
6   instrumentalness    0.080836
7     time_signature    0.031801
8       danceability    0.047845
9                key    0.051287
10          loudness    0.076194
11           valence    0.069448
12              mode    0.065403

from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor for each scaled feature, after removing the
# cluster label column.
test_df.drop(columns=["cluster_labels"], inplace=True)
vif = test_df
vif_data = pd.DataFrame({
    "feature": test_df.columns,
    "VIF": [
        variance_inflation_factor(test_df.values, position)
        for position in range(len(vif.columns))
    ],
})
print(vif_data)

             feature       VIF
0         popularity  1.023021
1             energy  2.903901
2           liveness  1.149133
3              tempo  1.029727
4        speechiness  1.133861
5       acousticness  1.291507
6   instrumentalness  1.186506
7     time_signature  1.037909
8       danceability  1.308180
9                key  1.098585
10          loudness  2.483196
11           valence  1.308944
12              mode  1.093684

# Columns excluded from the distribution analysis that follows.
columns_to_remove = [
    "liveness", "popularity", "loudness", "speechiness",
    "key", "mode", "time_signature",
]
data = df_f.drop(columns=columns_to_remove)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1750 entries, O Pana! to Strike (Holster)
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   energy            1750 non-null   float64
 1   tempo             1750 non-null   float64
 2   acousticness      1750 non-null   float64
 3   instrumentalness  1750 non-null   float64
 4   danceability      1750 non-null   float64
 5   valence           1750 non-null   float64
dtypes: float64(6)
memory usage: 95.7+ KB

# Histogram and density plot, side by side, for each remaining numeric feature.
data = df_f.drop(columns_to_remove, axis=1)
num_features = data.select_dtypes(include=[np.number]).columns
figure, axes = plt.subplots(len(num_features), 2, figsize=(15, 5 * len(num_features)))

for i, feature in enumerate(num_features):
    hist_ax, density_ax = axes[i][0], axes[i][1]
    sns.histplot(data[feature], kde=False, ax=hist_ax, bins=30)
    hist_ax.set_title(f'Histogram of {feature}')
    sns.kdeplot(data[feature], fill=True, ax=density_ax)
    density_ax.set_title(f'Density Plot of {feature}')

plt.tight_layout()
plt.show()

print("Skewness Values: ")
print(data.skew())

Skewness Values:
energy             -0.189202
tempo              -0.079355
acousticness        1.116217
instrumentalness    3.041488
danceability       -0.569501
valence             0.291514
dtype: float64

from scipy.stats import boxcox
# Reduce skew on a fresh copy of the selected features, then compare the
# densities before (top row) and after (bottom row) the transformation.
data = df_f.drop(columns_to_remove, axis=1)

right_skewed_columns = ['acousticness', 'instrumentalness']
left_skewed_columns = ['danceability']

# Right-skewed columns get a Box-Cox transform; the +1 shift is presumably
# there because Box-Cox expects strictly positive input. Columns with a
# single distinct value are left untouched.
for col in right_skewed_columns:
    if data[col].nunique() <= 1:
        continue
    data[col], _ = boxcox(data[col] + 1)

# Left-skewed columns are squared.
for col in left_skewed_columns:
    if data[col].nunique() <= 1:
        continue
    data[col] = np.square(data[col])

columns_to_plot = right_skewed_columns + left_skewed_columns
n_cols = len(columns_to_plot)

fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(4 * n_cols, 8))

for i, col in enumerate(columns_to_plot):
    original_ax, transformed_ax = axes[0, i], axes[1, i]
    sns.kdeplot(data_original[col], ax=original_ax, fill=True)
    original_ax.set_title(f'Original {col}')
    sns.kdeplot(data[col], ax=transformed_ax, fill=True)
    transformed_ax.set_title(f'Transformed {col}')

plt.tight_layout()
plt.show()
print("Skewness Values: ")
print(data.skew())

Skewness Values:
energy             -0.189202
tempo              -0.079355
acousticness        0.311325
instrumentalness    1.856363
danceability        0.046399
valence             0.291514
dtype: float64

# Compare two alternative treatments of the heavily skewed
# `instrumentalness` feature: a log1p transform and a coarse binning.
data = df_f.drop(columns_to_remove, axis=1)

data['instrumentalness_log'] = np.log1p(data['instrumentalness'])

bins = [0, 0.001, 0.01, 0.1, 0.5, 1]
labels = ['0-0.001', '0.001-0.01', '0.01-0.1', '0.1-0.5', '0.5-1']
# include_lowest=True makes the first interval closed on the left, so values
# that are exactly 0 land in the '0-0.001' bin instead of becoming NaN and
# silently disappearing from the count plot.
data['instrumentalness_binned'] = pd.cut(
    data['instrumentalness'], bins=bins, labels=labels, include_lowest=True
)

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

sns.kdeplot(data_original['instrumentalness'], ax=axes[0], fill=True)
axes[0].set_title('Original Instrumentalness')

sns.kdeplot(data['instrumentalness_log'], ax=axes[1], fill=True)
axes[1].set_title('Log-transformed Instrumentalness')

sns.countplot(data=data, x='instrumentalness_binned', ax=axes[2])
axes[2].set_title('Binned Instrumentalness')

plt.tight_layout()
plt.show()

from sklearn.preprocessing import QuantileTransformer
# Map `instrumentalness` onto a uniform distribution with a quantile
# transform and plot the density before and after.
data = df_f.drop(columns_to_remove, axis=1)

qt = QuantileTransformer(n_quantiles=1000, output_distribution='uniform', random_state=42)
data['instrumentalness_qt'] = qt.fit_transform(data[['instrumentalness']])

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# `fill=True` is used instead of the deprecated `shade=True` spelling, in
# line with the other kdeplot calls in this notebook.
sns.kdeplot(data_original['instrumentalness'], ax=axes[0], fill=True)
axes[0].set_title('Original Instrumentalness')

sns.kdeplot(data['instrumentalness_qt'], ax=axes[1], fill=True)
axes[1].set_title('Quantile-transformed Instrumentalness')

plt.tight_layout()
plt.show()

# Apply the chosen transformation to each skewed feature on a fresh copy:
# quantile transform for instrumentalness, Box-Cox for acousticness and
# squaring for danceability. The cell ends by displaying the skewness.
data = df_f.drop(columns_to_remove, axis=1)
qt = QuantileTransformer(n_quantiles=1000, output_distribution='uniform', random_state=42)
data['instrumentalness'] = qt.fit_transform(data[['instrumentalness']])

right_skewed_columns = ['acousticness']
left_skewed_columns = ['danceability']

for col in right_skewed_columns:
    # Columns with a single distinct value are left untouched.
    if data[col].nunique() <= 1:
        continue
    data[col], _ = boxcox(data[col] + 1)

for col in left_skewed_columns:
    if data[col].nunique() <= 1:
        continue
    data[col] = np.square(data[col])
data.skew()

energy             -0.189202
tempo              -0.079355
acousticness        0.311325
instrumentalness    0.001502
danceability        0.046399
valence             0.291514
dtype: float64

from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from scipy.stats import boxcox

# Load the descriptor table and replace missing values with empty strings.
df = pd.read_csv("descriptors.csv")
df.fillna("", inplace=True)

scaler = MinMaxScaler()

# Add a min-max scaled "<feature>_normalized" column for each audio feature.
# The scaler is refit for every column, so each one is scaled independently.
for feature in ("danceability", "energy", "acousticness", "instrumentalness", "valence", "tempo"):
    df[f"{feature}_normalized"] = scaler.fit_transform(df[[feature]])

import random
def assign_danceability(value):
    """Translate a danceability score into a descriptive phrase.

    The score is checked against the inclusive upper bounds 0.2, 0.4, 0.6
    and 0.8 in turn; a score matching none of them falls into the top band.
    The phrase returned is the entry at position ``index`` (a module-level
    variable) of the matching band's phrase list.
    """
    phrase_bands = (
        (0.2, [
            'Not suitable for dancing',
            'Terrible for dancing',
            'No dance vibes',
            'Not engaging at all',
            'Barely entertaining',
            'Lacks energy and rhythm',
            'Not recommended for fun activities',
        ]),
        (0.4, [
            'Slightly suitable for dancing',
            'A hint of danceability',
            'Could maybe dance to this',
            'Mildly engaging',
            'Somewhat entertaining',
            'A touch of rhythm',
            'Not bad for casual fun',
        ]),
        (0.6, [
            'Suitable for dancing',
            'Danceable',
            'Move to the groove',
            'Engaging and entertaining',
            'Good for a variety of activities',
            'Catchy rhythm and beat',
            'Fun for parties and gatherings',
        ]),
        (0.8, [
            'Highly suitable for dancing',
            'Perfect for dancing',
            'Dance floor magnet',
            'Extremely engaging',
            'Highly entertaining',
            'Ideal for various activities',
            'A must-have for any party playlist',
        ]),
    )
    top_band = [
        'Dance party essential',
        "Can't stop moving",
        'Irresistible dance moves',
        'Incredibly captivating',
        'Unbelievable entertainment',
        'Perfect for any energetic activity',
        'Impossible to sit still',
    ]

    for upper_bound, phrases in phrase_bands:
        if value <= upper_bound:
            return phrases[index]
    return top_band[index]


def assign_energy(value):
    """Translate an energy score into a descriptive phrase.

    Bands are delimited by the inclusive upper bounds 0.2, 0.4, 0.6 and
    0.8; a score matching none of them falls into the top band. The phrase
    at position ``index`` (a module-level variable) of the matching band
    is returned.
    """
    phrase_bands = (
        (0.2, [
            'Sluggish energy', 'Sloth-like', 'Laid-back energy', 'Super relaxed', 'Lazy vibes',
            'Couch potato mode', 'Doze-inducing', 'Mellow to the core', 'Zero rush',
        ]),
        (0.4, [
            'Calm energy', 'Chill vibes', 'Easygoing energy', 'Cool and calm', 'Breezy presence',
            'Stress-free spirit', 'Unruffled demeanor', 'Languid essence', 'Serene and tranquil',
        ]),
        (0.6, [
            'Average energy', 'Steadily paced', 'Middle-of-the-road energy',
            'Neither high nor low energy', 'Neutral tempo', 'Balanced liveliness',
            'Standard vitality', 'Routine dynamism', 'Moderate pep',
        ]),
        (0.8, [
            'Energetic', 'Excitedly paced', 'Full of life', 'Bouncing with energy',
            'Active and animated', 'Vibrant spirit', 'Sprightly enthusiasm', 'Brisk and lively',
            'Fired up', 'Peppy and zippy',
        ]),
    )
    top_band = [
        'Explosively energetic', 'Vitality overload', 'Through the roof energy',
        'Turbocharged zest', 'Supersonic liveliness', 'Hyperactive buzz', 'Intense vigor',
        'Frenetic enthusiasm', 'Unstoppable force',
    ]

    for upper_bound, phrases in phrase_bands:
        if value <= upper_bound:
            return phrases[index]
    return top_band[index]




def assign_acousticness(value):
    """Translate an acousticness score into a descriptive phrase.

    Bands are delimited by the inclusive upper bounds 0.2, 0.4, 0.6 and
    0.8; a score matching none of them falls into the top band. The phrase
    at position ``index`` (a module-level variable) of the matching band
    is returned.
    """
    phrase_bands = (
        (0.2, [
            'Technologically enhanced beyond acoustic', 'High-tech', 'Synthetic sounds',
            'Electronic vibes', 'Futuristic audio', 'Digitally-enhanced', 'Cutting-edge sonics',
            'Modernized melodies', 'Innovative audio experiences', 'Robotic rhythms',
            'Sci-fi soundscapes',
        ]),
        (0.4, [
            'Deeper fusion of acoustic and digital', 'Electro-acoustic', 'Hybrid sounds',
            'Mix of natural and electronic', 'Synergy of organic and synthetic',
            'Crossover tones', 'Amalgamated audio', 'Blended soundscapes', 'Melded melodies',
            'Technorganic tunes',
        ]),
        (0.6, [
            'Combination of acoustic and electronic', 'Balanced', 'Well-rounded sound',
            'Equilibrium audio', 'Harmonious blend', 'Even mix of sound sources',
            'In-sync audio', 'Stable soundscapes', 'Yin-yang of audio', 'Auditory harmony',
        ]),
        (0.8, [
            'Pure Acoustic', 'Natural sound', 'Unprocessed and organic', 'Authentic audio',
            'Raw resonance', 'Original and unmodified', 'Pure and untouched',
            'Straight-from-the-source', 'True-to-life tones', 'Untainted tunes',
        ]),
    )
    top_band = [
        'Bare-bones acoustic', 'Unplugged', 'Stripped-down and raw', 'Unadorned acoustics',
        'Simple and honest sound', 'Essence of the music', 'Back-to-basics',
        'Intimate and close-up', 'Straight to the heart sounds',
    ]

    for upper_bound, phrases in phrase_bands:
        if value <= upper_bound:
            return phrases[index]
    return top_band[index]


def assign_instrumentalness(value):
    """Translate an instrumentalness score into a descriptive phrase.

    Bands are delimited by the inclusive upper bounds 0.2, 0.4, 0.6 and
    0.8; any other score falls into the top band. The phrase at position
    ``index`` (a module-level variable) of the matching band is returned.

    An unreachable second if/elif chain that used to follow the final
    ``return`` (and referenced names never defined in this function) has
    been removed.
    """
    vocal_centric = ['Vocal-centric instrumentalness', 'Lyric-focused', 'All about the voice', 'Voice-driven', 'Singing takes the spotlight', 'Primarily vocal', 'Centered on vocals', 'Lyrics steal the show']

    vocal_instrumental_blend = ['Vocal-instrumental blend', 'Nice mix of vocals and instruments', 'Equal focus on voice and instruments', 'Harmonious vocal-instrumental balance', 'Perfect fusion of voice and instruments', 'Vocals and instruments in sync', 'Even emphasis on both vocals and instruments', 'Neck and neck with voice and instruments']

    instrumental_leaning = ['Instrumental-leaning', 'More instrumental than vocal', 'Instruments take center stage', 'Instruments lead the charge', 'Vocals play supporting role', 'Emphasis on the instrumentals', 'Instrument-heavy with a dash of vocals', 'Instruments in the limelight']

    instrumental_dominant = ['Instrumental dominant', 'Mostly instrumental', 'Vocals take a backseat', 'Instruments reign supreme', 'Vocals sparingly used', 'Majority of focus on instrumentals', 'Mainly about the instruments', 'Barely any vocals present']

    completely_instrumental = ['Completely instrumental', 'No vocals', 'Purely instrumental sounds', 'Devoid of vocals', 'Vocals-free', 'All-instrumental performance', 'Solely about instrumentals', 'Without a hint of vocalization']

    if value <= 0.2:
        return vocal_centric[index]
    elif value <= 0.4:
        return vocal_instrumental_blend[index]
    elif value <= 0.6:
        return instrumental_leaning[index]
    elif value <= 0.8:
        return instrumental_dominant[index]
    else:
        return completely_instrumental[index]


def assign_tempo(value):
    """Translate a normalised tempo score into a descriptive phrase.

    Bands are delimited by the inclusive upper bounds 0.2, 0.4, 0.6 and
    0.8; a score matching none of them falls into the top band. The phrase
    at position ``index`` (a module-level variable) of the matching band
    is returned.
    """
    phrase_bands = (
        (0.2, ['Slow tempo', 'Molasses slow', 'Crawling tempo', 'Glacial progress',
               'Snail-like pace', 'Dawdling gait', 'Languid motion']),
        (0.4, ['Relaxed', 'Unhurried', 'Relaxed tempo', 'Easygoing stride', 'Measured pace',
               'Unruffled cadence', 'Laid-back rhythm']),
        (0.6, ['Medium tempo', 'Steady pace', 'Medium tempo', 'Balanced tempo', 'Composed gait',
               'Even pace', 'Temperate rhythm']),
        (0.8, ['Fast tempo', 'Upbeat', 'Quick tempo', 'Brisk pace', 'Swift stride',
               'Nimble rhythm', 'Energetic motion']),
    )
    top_band = ['Rapid tempo', 'Rapid-fire pace', 'Breakneck speed', 'Blistering quickness',
                'Frenetic haste', 'Meteoric velocity', 'Turbulent acceleration']

    for upper_bound, phrases in phrase_bands:
        if value <= upper_bound:
            return phrases[index]
    return top_band[index]


def assign_valence(value):
    """Translate a valence score into a descriptive phrase.

    Bands are delimited by the inclusive upper bounds 0.2, 0.4, 0.6 and
    0.8; any other score falls into the top band. Each band currently
    holds a single phrase, selected with the module-level ``index``.
    """
    worse = ["Devastated"]
    bad = ["Dismayed"]
    neutral = ["Neutral emotions"]
    good = ["Joyful emotions"]
    great = ["Ecstatic"]

    # Guard clauses from the lowest band upwards; the first match wins.
    if value <= 0.2:
        return worse[index]
    if value <= 0.4:
        return bad[index]
    if value <= 0.6:
        return neutral[index]
    if value <= 0.8:
        return good[index]
    return great[index]


# Position of the phrase picked from every band's phrase list by the
# assign_* helpers.
index = 0

# Replace each normalised score column with its descriptive phrase.
label_makers = {
    'danceability_normalized': assign_danceability,
    'energy_normalized': assign_energy,
    'acousticness_normalized': assign_acousticness,
    'instrumentalness_normalized': assign_instrumentalness,
    'valence_normalized': assign_valence,
    'tempo_normalized': assign_tempo,
}
for column, make_label in label_makers.items():
    df[column] = df[column].apply(make_label)

import concurrent.futures
import time
import pandas as pd
import openai
import tenacity
from typing import List, Dict, Tuple
from tqdm import tqdm

EMBEDDING_MODEL = "text-embedding-ada-002"


@tenacity.retry(
    reraise=True,
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_fixed(5) + tenacity.wait_exponential(multiplier=2),
)
def get_embedding(text: str, model: str = EMBEDDING_MODEL, vector_length=1536):
    """Return the embedding of ``text`` produced by ``openai.Embedding.create``.

    Falsy ``text`` short-circuits to a vector of ``vector_length`` zeros
    without calling the API. The ``tenacity`` decorator retries failed
    calls (three attempts at most) and re-raises the last error.
    """
    if text:
        # Short pause before every request.
        time.sleep(0.1)
        response = openai.Embedding.create(
            model=model,
            input=text,
            output_format="list",
            vector_length=vector_length,
        )
        return response["data"][0]["embedding"]
    return [0] * vector_length

def get_embeddings_concurrently(texts: List[str], batch_size: int = 50) -> List[List[float]]:
    """Embed ``texts`` with a thread pool, one batch of requests at a time.

    Args:
        texts: Strings to embed.
        batch_size: Number of requests submitted to the pool per batch.

    Returns:
        One entry per input text, in the same order as ``texts``. An entry
        is ``None`` when ``get_embedding`` raised for that text (the error
        is printed).
    """
    embeddings = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
        for batch in tqdm(batches, total=len(batches)):
            futures = [executor.submit(get_embedding, text) for text in batch]
            # Collect in submission order. as_completed() yields futures in
            # completion order, which would misalign the embeddings with
            # `texts` (and with the rows they are later attached to).
            for future in futures:
                try:
                    embeddings.append(future.result())
                except Exception as e:
                    print(f"Failed to obtain embedding: {e}")
                    embeddings.append(None)
    return embeddings


# Columns that feed the embeddings. `.copy()` makes df_embed an independent
# frame, so adding a column below no longer triggers pandas'
# SettingWithCopyWarning.
df_embed = df[[
    "title", "descriptors", "genres",
    "danceability_normalized", "energy_normalized", "acousticness_normalized",
    "instrumentalness_normalized", "valence_normalized", "tempo_normalized",
]].copy()
# Join the six per-feature phrases into one space-separated text per row.
df_embed["numerical_features"] = df_embed[["danceability_normalized", "energy_normalized", "acousticness_normalized", "instrumentalness_normalized", "valence_normalized", "tempo_normalized"]].apply(lambda row: ' '.join(row.map(str)), axis=1)

descriptors_embeddings = get_embeddings_concurrently(df_embed["descriptors"].tolist())
genres_embeddings = get_embeddings_concurrently(df_embed["genres"].tolist())
numerical_features_embeddings = get_embeddings_concurrently(df_embed["numerical_features"].tolist())

# One row per title, holding its three embedding vectors.
df_embeddings = pd.DataFrame({
    "title": df["title"],
    "descriptors_embedding": descriptors_embeddings,
    "genres_embedding": genres_embeddings,
    "numerical_features_embedding": numerical_features_embeddings
})

/var/folders/y_/d0wbwn_s1tlbh09zvpsn9bp40000gn/T/ipykernel_61853/360115022.py:44: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_embed["numerical_features"] = df_embed[["danceability_normalized", "energy_normalized", "acousticness_normalized", "instrumentalness_normalized", "valence_normalized", "tempo_normalized"]].apply(lambda row: ' '.join(row.map(str)), axis=1)
100%|███████████████████████████████████████████| 35/35 [01:24<00:00,  2.42s/it]
100%|███████████████████████████████████████████| 35/35 [01:33<00:00,  2.67s/it]
100%|███████████████████████████████████████████| 35/35 [01:34<00:00,  2.71s/it]

import numpy as np

# For each embedding family, stack the per-row vectors into one matrix,
# then average the per-dimension means and standard deviations.
descriptors_matrix = np.vstack(df_embeddings["descriptors_embedding"])
descriptors_mean = np.mean(descriptors_matrix.mean(axis=0))
descriptors_std = np.mean(descriptors_matrix.std(axis=0))

genres_matrix = np.vstack(df_embeddings["genres_embedding"])
genres_mean = np.mean(genres_matrix.mean(axis=0))
genres_std = np.mean(genres_matrix.std(axis=0))

numerical_features_matrix = np.vstack(df_embeddings["numerical_features_embedding"])
numerical_features_mean = np.mean(numerical_features_matrix.mean(axis=0))
numerical_features_std = np.mean(numerical_features_matrix.std(axis=0))

print("Descriptors mean and standard deviation:", descriptors_mean, descriptors_std)
print("Genres mean and standard deviation:", genres_mean, genres_std)
print("Numerical features mean and standard deviation:", numerical_features_mean, numerical_features_std)

Descriptors mean and standard deviation: -0.0006624432726102312 0.009985874238851751
Genres mean and standard deviation: -0.000655091559020203 0.010512090589250095
Numerical features mean and standard deviation: -0.0007223709017817755 0.006622184106810654

from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from umap.umap_ import UMAP
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
import matplotlib.pyplot as plt

# Standardise each embedding family separately, cluster it with KMeans and
# report its silhouette score.
scaler = StandardScaler()
descriptors_scaled = scaler.fit_transform(df_embeddings["descriptors_embedding"].tolist())
genres_scaled = scaler.fit_transform(df_embeddings["genres_embedding"].tolist())
numerical_features_scaled = scaler.fit_transform(df_embeddings["numerical_features_embedding"].tolist())

# n_init is pinned to 10, the default in effect when this was run, so that
# results do not shift when the default changes to 'auto' and the
# FutureWarning is silenced.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
descriptors_clusters = kmeans.fit_predict(descriptors_scaled)
genres_clusters = kmeans.fit_predict(genres_scaled)
numerical_features_clusters = kmeans.fit_predict(numerical_features_scaled)

descriptors_silhouette = silhouette_score(descriptors_scaled, descriptors_clusters)
genres_silhouette = silhouette_score(genres_scaled, genres_clusters)
numerical_features_silhouette = silhouette_score(numerical_features_scaled, numerical_features_clusters)

print(f"Descriptors silhouette score: {descriptors_silhouette}")
print(f"Genres silhouette score: {genres_silhouette}")
print(f"Numerical features silhouette score: {numerical_features_silhouette}")

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

Descriptors silhouette score: 0.1068023386365533
Genres silhouette score: 0.26548196731323515
Numerical features silhouette score: 0.1301613527629044

# Relative contribution of each embedding family to the combined vector.
# The three weights sum to 1.0.
descriptors_weight = .15
genres_weight = .55
numerical_features_weight = .3

def combine_and_weigh_embeddings(row):
    """Blend a row's three embeddings into a single weighted vector.

    Each embedding is scaled by its module-level weight
    (``descriptors_weight``, ``genres_weight``,
    ``numerical_features_weight``) and the results are summed
    element-wise. The blended vector is returned as a plain list.
    """
    descriptors, genres, numerical_features = (
        np.array(row[column])
        for column in (
            "descriptors_embedding",
            "genres_embedding",
            "numerical_features_embedding",
        )
    )
    blended = (
        descriptors_weight * descriptors
        + genres_weight * genres
        + numerical_features_weight * numerical_features
    )
    return blended.tolist()

# Build one combined vector per row (axis=1 passes each row to the function).
df_embeddings["combined_embedding"] = df_embeddings.apply(combine_and_weigh_embeddings, axis=1)

def plot_clustering(X, clusters, title):
    """Scatter-plot the first two columns of ``X``, coloured by cluster label.

    Only columns 0 and 1 of ``X`` are drawn, whatever its width. The figure
    is shown immediately.
    """
    first_column, second_column = X[:, 0], X[:, 1]
    plt.scatter(first_column, second_column, c=clusters, cmap='viridis', edgecolor='k', s=100)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.grid()
    plt.show()

# Cluster the combined embeddings with KMeans after five different feature
# reductions, plotting each result.
combined_embeddings_list = df_embeddings["combined_embedding"].tolist()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(combined_embeddings_list)

# NOTE(review): X_scaled is already standardised, so every non-constant
# column should have variance ~1 and this 0.16 threshold presumably only
# removes constant columns. Confirm that is intended.
selector = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
X_high_variance = selector.fit_transform(X_scaled)
selected_features = selector.get_support()

# n_init is pinned to 10, the default in effect when this was run, so that
# results do not shift when the default changes to 'auto' and the
# FutureWarning is silenced.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
clusters_high_variance = kmeans.fit_predict(X_high_variance)
plot_clustering(X_high_variance, clusters_high_variance, 'Clustered data (high variance features)')

# Keep only features whose strongest absolute correlation with any other
# feature is below 0.9.
correlation_matrix = np.abs(np.corrcoef(X_scaled, rowvar=False))
np.fill_diagonal(correlation_matrix, 0)
max_corr = correlation_matrix.max(axis=1)
mask = max_corr < 0.9

X_low_corr = X_scaled[:, mask]

clusters_low_corr = kmeans.fit_predict(X_low_corr)
plot_clustering(X_low_corr, clusters_low_corr, 'Clustered data (low correlated features)')

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
clusters_pca = kmeans.fit_predict(X_pca)
plot_clustering(X_pca, clusters_pca, 'Clustered data (PCA)')

tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
clusters_tsne = kmeans.fit_predict(X_tsne)
plot_clustering(X_tsne, clusters_tsne, 'Clustered data (t-SNE)')

reducer = UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)
clusters_UMAP = kmeans.fit_predict(X_umap)
plot_clustering(X_umap, clusters_UMAP, 'Clustered data (UMAP)')

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Print three cluster-quality scores for every (features, labels) pair.
# NOTE(review): the name `datasets` rebinds the `sklearn.datasets` module
# imported earlier in this file.
datasets = [
    (X_high_variance, clusters_high_variance, "High Variance Features"),
    (X_low_corr, clusters_low_corr, "Low Correlated Features"),
    (X_pca, clusters_pca, "PCA"),
    (X_tsne, clusters_tsne, "t-SNE"),
    (X_umap, clusters_UMAP, "UMAP"),
]

print("Silhouette Score | Davies-Bouldin Index | Calinski-Harabasz Index | Technique")
print("-------------------------------------------------------------------------")
for X_transformed, clusters, technique in datasets:
    silhouette, davies_bouldin, calinski_harabasz = (
        metric(X_transformed, clusters)
        for metric in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    )
    print(f"{silhouette:.4f}          | {davies_bouldin:.4f}              | {calinski_harabasz:.4f}                | {technique}")

Silhouette Score | Davies-Bouldin Index | Calinski-Harabasz Index | Technique
-------------------------------------------------------------------------
0.1540          | 2.2969              | 70.8264                | High Variance Features
0.1458          | 2.6108              | 69.4565                | Low Correlated Features
0.4325          | 0.7239              | 4233.8896                | PCA
0.5113          | 0.6440              | 3080.0098                | t-SNE
0.6069          | 0.5072              | 5223.4954                | UMAP

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering

# Repeat the reduction-and-cluster comparison with agglomerative
# (hierarchical) clustering. The single `hierarchical` estimator is refit
# on each representation in turn.
selector = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
X_high_variance = selector.fit_transform(X_scaled)
selected_features = selector.get_support()

hierarchical = AgglomerativeClustering(n_clusters=15)
clusters_high_variance = hierarchical.fit_predict(X_high_variance)
plot_clustering(X_high_variance, clusters_high_variance, 'Clustered data (high variance features)')

# Keep only features whose strongest absolute correlation with any other
# feature is below 0.9.
correlation_matrix = np.abs(np.corrcoef(X_scaled, rowvar=False))
np.fill_diagonal(correlation_matrix, 0)
max_corr = correlation_matrix.max(axis=1)
mask = max_corr < 0.9

X_low_corr = X_scaled[:, mask]

clusters_low_corr = hierarchical.fit_predict(X_low_corr)
plot_clustering(X_low_corr, clusters_low_corr, 'Clustered data (low correlated features)')

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
clusters_pca = hierarchical.fit_predict(X_pca)
plot_clustering(X_pca, clusters_pca, 'Clustered data (PCA)')

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
clusters_tsne = hierarchical.fit_predict(X_tsne)
plot_clustering(X_tsne, clusters_tsne, 'Clustered data (t-SNE)')
# The t-SNE based hierarchical labels are stored on the main frame.
df["clusters"] = clusters_tsne

reducer = UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)
clusters_UMAP = hierarchical.fit_predict(X_umap)
plot_clustering(X_umap, clusters_UMAP, 'Clustered data (UMAP)')

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Print three cluster-quality scores for every (features, labels) pair.
# NOTE(review): the name `datasets` rebinds the `sklearn.datasets` module
# imported earlier in this file.
datasets = [
    (X_high_variance, clusters_high_variance, "High Variance Features"),
    (X_low_corr, clusters_low_corr, "Low Correlated Features"),
    (X_pca, clusters_pca, "PCA"),
    (X_tsne, clusters_tsne, "t-SNE"),
    (X_umap, clusters_UMAP, "UMAP"),
]

print("Silhouette Score | Davies-Bouldin Index | Calinski-Harabasz Index | Technique")
print("-------------------------------------------------------------------------")
for X_transformed, clusters, technique in datasets:
    silhouette, davies_bouldin, calinski_harabasz = (
        metric(X_transformed, clusters)
        for metric in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    )
    print(f"{silhouette:.4f}          | {davies_bouldin:.4f}              | {calinski_harabasz:.4f}                | {technique}")

Silhouette Score | Davies-Bouldin Index | Calinski-Harabasz Index | Technique
-------------------------------------------------------------------------
0.1269          | 2.6050              | 79.2627                | High Variance Features
0.1267          | 2.6061              | 78.6258                | Low Correlated Features
0.4091          | 0.6959              | 3642.5595                | PCA
0.4395          | 0.7607              | 2265.0952                | t-SNE
0.5581          | 0.5556              | 3630.7808                | UMAP

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler
import numpy as np



# Repeat the reduction-and-cluster comparison with spectral clustering.
# NOTE(review): `n_clusters` is assigned but not used below; the spectral
# model is built with 20 clusters. Confirm which value is intended.
n_clusters = 15
spectral = SpectralClustering(n_clusters=20, affinity='nearest_neighbors')

selector = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
X_high_variance = selector.fit_transform(X_scaled)
selected_features = selector.get_support()

# This KMeans instance is not fitted in this cell; it is defined here and
# used later in the notebook. n_init is pinned to 10, the default in effect
# when this was run, so that results do not shift when the default changes
# to 'auto' and the FutureWarning is silenced.
kmeans = KMeans(n_clusters=25, n_init=10, random_state=42)
clusters_high_variance = spectral.fit_predict(X_high_variance)
plot_clustering(X_high_variance, clusters_high_variance, 'Clustered data (high variance features)')

# Keep only features whose strongest absolute correlation with any other
# feature is below 0.9.
correlation_matrix = np.abs(np.corrcoef(X_scaled, rowvar=False))
np.fill_diagonal(correlation_matrix, 0)
max_corr = correlation_matrix.max(axis=1)
mask = max_corr < 0.9

X_low_corr = X_scaled[:, mask]

clusters_low_corr = spectral.fit_predict(X_low_corr)
plot_clustering(X_low_corr, clusters_low_corr, 'Clustered data (low correlated features)')

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
clusters_pca = spectral.fit_predict(X_pca)
plot_clustering(X_pca, clusters_pca, 'Clustered data (PCA)')

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
clusters_tsne = spectral.fit_predict(X_tsne)
plot_clustering(X_tsne, clusters_tsne, 'Clustered data (t-SNE)')


reducer = UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)
clusters_UMAP = spectral.fit_predict(X_umap)
plot_clustering(X_umap, clusters_UMAP, 'Clustered data (UMAP)')

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/manifold/_spectral_embedding.py:274: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn(

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/manifold/_spectral_embedding.py:274: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn(

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/manifold/_spectral_embedding.py:274: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn(

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/manifold/_spectral_embedding.py:274: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn(

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Each entry: (feature matrix, cluster labels computed on it, display name).
datasets = [
    (X_high_variance, clusters_high_variance, "High Variance Features"),
    (X_low_corr, clusters_low_corr, "Low Correlated Features"),
    (X_pca, clusters_pca, "PCA"),
    (X_tsne, clusters_tsne, "t-SNE"),
    (X_umap, clusters_UMAP, "UMAP"),
]

print("Silhouette Score | Davies-Bouldin Index | Calinski-Harabasz Index | Technique")
print("-------------------------------------------------------------------------")
for X_transformed, clusters, technique in datasets:
    silhouette = silhouette_score(X_transformed, clusters)
    davies_bouldin = davies_bouldin_score(X_transformed, clusters)
    calinski_harabasz = calinski_harabasz_score(X_transformed, clusters)
    # Pad each value to the width of its header (16 / 20 / 23 characters).
    # The previous fixed run of spaces drifted out of line whenever a value
    # was negative or had more than one integer digit.
    print(f"{silhouette:<16.4f} | {davies_bouldin:<20.4f} | {calinski_harabasz:<23.4f} | {technique}")

Silhouette Score | Davies-Bouldin Index | Calinski-Harabasz Index | Technique
-------------------------------------------------------------------------
0.1143          | 1.9751              | 56.5230                | High Variance Features
0.1058          | 1.8648              | 53.3491                | Low Correlated Features
0.3686          | 0.7487              | 3261.6389                | PCA
0.0972          | 3.4658              | 135.1128                | t-SNE
-0.2681          | 4.9292              | 41.1453                | UMAP

# Final assignment: rebuild the seeded 2-D UMAP embedding and cluster it with
# the k-means estimator instead of the spectral one.
reducer = UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_scaled)
clusters_UMAP = kmeans.fit_predict(X_umap)
plot_clustering(X_umap, clusters_UMAP, 'Clustered data (UMAP)')
# Labels are attached positionally.
# NOTE(review): assumes df_embeddings rows are in the same order as X_scaled — confirm.
df_embeddings["cluster"] = clusters_UMAP

/Users/gavinmason/miniconda3/envs/botify/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

# Number of tracks per cluster, as a two-column frame ("cluster", "count").
tracks_per_cluster = df_embeddings.groupby("cluster").size()
cluster_counts = tracks_per_cluster.reset_index(name="count")
print(cluster_counts)

    cluster  count
0         0     39
1         1    100
2         2    143
3         3    104
4         4     43
5         5    118
6         6    107
7         7     70
8         8     50
9         9    105
10       10    129
11       11     55
12       12     27
13       13     73
14       14     33
15       15     12
16       16     74
17       17     41
18       18     66
19       19    118
20       20     85
21       21     29
22       22     58
23       23     44
24       24     27

# Print up to 10 random track titles from each cluster for a quick eyeball check.
for cluster, group in df_embeddings.groupby("cluster"):
    print(f"Cluster {cluster}:")
    # Cap the sample at the cluster size: sample(10) raises ValueError when a
    # cluster holds fewer than 10 tracks.
    print(group["title"].sample(min(10, len(group))))
    print("=====================================")

Cluster 0:
1567                                               Phone
1571                                              Nascar
1684                                            i need u
1593    Folie Á Deux (feat. Westside Gunn & Keisha Plum)
1698                                      100 Round Drum
1742                                                FOLD
1598                                            Drakon !
1695                                        Ease My Pain
1569                                              Lately
1184                                        The God Hour
Name: title, dtype: object
=====================================
Cluster 1:
1632                                       25WAGG3DOU2
304                      Jesus Forgive Me, I Am A Thot
1491                                             On BS
856                                       Fallen Demon
602     Heaven Knows I'm Miserable Now - 2011 Remaster
78                                    Imaginary Places
1590                                        Peppermint
663                                             RENTAL
1626                                            pRETTy
1625                                    The Alchemist.
Name: title, dtype: object
=====================================
Cluster 2:
1486                  BackOutsideBoyz
1065                           Famous
1317    Ghost Rider - 2019 - Remaster
964                        Культ тела
1148                           Doublë
1107                         Im Tupac
570                        Crime Pays
1279                        Fingerbib
1458                       by the law
780                             Tampa
Name: title, dtype: object
=====================================
Cluster 3:
1428                                   Selfish
786                                Summer Rain
1737                           Free Young Thug
1136                                    Palace
401     Gold Thangs & Pinky Rangs (Da Hooptie)
971                                      NO. 9
1714                                     Watch
1402                                        OG
324                        MTOMB (feat. Liv.e)
760                          Where The Hood At
Name: title, dtype: object
=====================================
Cluster 4:
837             I Am Bored
1399      Where da hook go
989           U could tëll
1431                 ACTOR
1110          Buy Whatever
1694            Lunch Meat
998          Say It Again!
993      Gorgeous Gorillaz
1447    passive aggression
990             New Turban
Name: title, dtype: object
=====================================
Cluster 5:
300                             Clay Pigeons
8                                       Dirt
1273                     Darkest Before Dawn
65      4 Gold Chains - (feat. Clams Casino)
715                        Don't You Love Me
167                                   L.M.F.
896                                RIP Young
1651                                Spotless
311                             PROLLY HEARD
1467              Cereal (feat. Kenny Mason)
Name: title, dtype: object
=====================================
Cluster 6:
1390                      Paraquat
314                           Numb
1358                    Fuck a War
175                     Like Water
537                       Magnolia
1120                     My Agenda
1587    Top (feat. Pi'erre Bourne)
1001                         Fukit
285           Freewave Freestyle 4
674                     Out Da Way
Name: title, dtype: object
=====================================
Cluster 7:
941               Choppa
190     Gone (Interlude)
1225            Juvenile
197         Terminal Sex
228            Self Care
115                Oldie
872             Woodlawn
208           Spider Web
113     Meet Mr. NICEGUY
790        As She Dreams
Name: title, dtype: object
=====================================
Cluster 8:
260                              GOSHA
1032                            Faizon
1384                  GEEKED N BLESSED
1385                     BEEN A MINUTE
248                        Almost Back
1365    200ACUP+RIPSMITHY (Evil Giane)
718                             5 Ways
1004                Story of an Artist
279                        Mount Sinai
649                             Geeked
Name: title, dtype: object
=====================================
Cluster 9:
523                                       Magic Arrow
178                                       December 24
1031                                     Miami Ultras
1335                                         Disagree
214                                        Fuck A Hoe
63                                          All I Got
1702                                          Oh Lord
536                                 Rhinestone Cowboy
569     Two Thousand Eighteen, Bye (see page 199–203)
597                                           Calcium
Name: title, dtype: object
=====================================
Cluster 10:
38                        Sunflower
1681                      Luv Drunk
1646                        Careful
290                         BOY BYE
1726                 I See (2 Many)
1618    ATLANTIS (feat. Chief Keef)
1036                     Cover Girl
1630                     Aztec Baby
1570             Geeked Out My Body
1659                 Elfen Lied 222
Name: title, dtype: object
=====================================
Cluster 11:
1735                   BIGBANG (crunchtime)
1422                                Walk In
827     Miss The Rage (feat. Playboi Carti)
1717                             real runtz
1498                            Remember Me
1574                        Sunflower Seeds
1362                                  SHIRT
1424                              Real Lies
1013                             Red Velvet
1436                             what a day
Name: title, dtype: object
=====================================
Cluster 12:
405         BB (BODYBAG)
210                Pemex
1482    SPIT IN MY FACE!
458         Passionfruit
427         MIDDLE CHILD
251         BTSTU - Demo
282           YEAH RIGHT
498     Yung Hustla (2K)
316                GHOST
1490         SHE THE ONE
Name: title, dtype: object
=====================================
Cluster 13:
1568                     Rick Ross
966                Полярная звезда
1492                        Eminem
769     Codeine Tears in Her Fanta
1407                        Switch
1496                         Petco
848                Hotel Breakfast
1510                          Skye
1067                    Raspy Shit
1432                  You Can Feel
Name: title, dtype: object
=====================================
Cluster 14:
245     Aliens Are Ghosts ($UICIDEBOY$ X TRAVIS BARKER)
672                                                adhd
1289                                        Black Cloud
236                                       Star Shopping
240                                  The Mourning After
464                                     Feeling Whitney
343                                     Uber Everywhere
247                                         Chewing Gum
956                  Polo Jeans (feat. Earl Sweatshirt)
224                                    Someone Like You
Name: title, dtype: object
=====================================
Cluster 15:
670       Greed (feat. Lil Yachty)
50                           TOKYO
15                           SWEET
291    Silkk da Shocka (feat. Syd)
12              TheGrandestNothing
52                            MILK
20                            STAR
292        Drugs You Should Try It
17                      white wine
53         Either Hated Or Ignored
Name: title, dtype: object
=====================================
Cluster 16:
1138                 Metallic Intuition
169                              Iceman
164                          Puff Daddy
153                  Southern Hostility
173                        Lil Motorola
1102    Smile #6 (see page 198 and 158)
255                            Titanium
1361                          Can't Cap
209                             BAD KID
675                      Divine Madness
Name: title, dtype: object
=====================================
Cluster 17:
1337                                 A Minute(lone)
701                                    Live Forever
112                          I Miss My Dead Friends
1738                                      McDonalds
1647                                    Dark Knight
889     Strawberry Fields Forever - Remastered 2009
1288                                    Let Me Live
1642                                   How Bout U ?
432                               Hot (feat. Gunna)
1538                                     The Roteks
Name: title, dtype: object
=====================================
Cluster 18:
87                                        EdEddnEddy
55      Ticker Tape (feat. Carly Simon & Kali Uchis)
616                       Only Everyone Can Judge Me
72                                     4th Dimension
1066                                           Honey
241                                         Vitamins
68                                           Leglock
873                  SIR BAUDELAIRE (feat. DJ Drama)
878                                      Hummingbird
501                               Group Autogenics I
Name: title, dtype: object
=====================================
Cluster 19:
836                               Let It Go
812                               Roundhere
604                Guild (feat. Mac Miller)
0                                   O Pana!
801                           Not Facinated
542     Can't Go For That (feat. Lil Duval)
1130                             Thin Flesh
544                                   Sense
807                                    Pill
831                                    ZION
Name: title, dtype: object
=====================================
Cluster 20:
46             I Know How It Ends
1109                   #RR anthem
747                     Gang Unit
735      O'lord! I Have My Doubts
1512                         TuTu
861                Xanny Bars 222
1229    Chuch (Suicide Year Edit)
748                        Freaks
1687                  Out On Bond
1230                     Survival
Name: title, dtype: object
=====================================
Cluster 21:
1312    How Soon Is Now? - 2011 Remaster
617                                Waste
1339                           Rave on U
1349                      Boys Don't Cry
1342                        CRYSTLCSTLES
970                          Sameolemeek
1573                             Be Mine
623                          PARTY IN LA
628                              Get Got
1716     Florida Water (feat. Luh Tyler)
Name: title, dtype: object
=====================================
Cluster 22:
1609           First Light
465      Frick Park Market
1286           Love Reigns
375                   Tuff
507              Accordion
93             Sailor Moon
1691          No morë talk
92      On Melancholy Hill
472        Thru the Screen
370       Deep Sea Tundras
Name: title, dtype: object
=====================================
Cluster 23:
302            Gospel
1582          Jumpin!
1149          Taliban
1029          Min Dag
319               DHL
695          40 Acres
660                OK
1261     Money Fetish
1147         Fazoland
116     Chippi Chippi
Name: title, dtype: object
=====================================
Cluster 24:
206                                 Real Nega
825                          You Motherfucker
186                        Communist Daughter
79                                      Rambo
1267    HARDSTYLE DRILL 2009 NOKIA ANGELZ 1.6
454                                    Flexin
131                        Peach Scone - Live
136                               Montego Bae
313                                       BIG
326                                In My Room
Name: title, dtype: object
=====================================

from collections import Counter
import random

# Copy the text and normalized audio-feature columns from the main frame onto
# df_embeddings (aligned on index).
df_embeddings["lyrics"] = df["lyrics"]
df_embeddings["descriptors"] = df["descriptors"]
df_embeddings["genres"] = df["genres"]
columns = ["danceability_normalized", "energy_normalized", "acousticness_normalized", "instrumentalness_normalized", "valence_normalized", "tempo_normalized"]
df_embeddings[columns] = df[columns]

# Flatten the comma-separated cells into individual tokens. Blank tokens are
# skipped: rows with no value previously produced an empty-string entry that
# took a slot in the top-20 genre list.
descriptors_list = [
    value.strip()
    for row in df_embeddings["descriptors"]
    for value in row.split(",")
    if value.strip()
]
descriptors_count = Counter(descriptors_list)

genres_list = [
    value.strip()
    for row in df_embeddings["genres"]
    for value in row.split(",")
    if value.strip()
]
genres_count = Counter(genres_list)


print("Descriptors frequencies (top 20):")
for descriptor, count in descriptors_count.most_common(20):
    print(f"{descriptor}: {count}")

print("\nGenres frequencies (top 20):")
for genre, count in genres_count.most_common(20):
    print(f"{genre}: {count}")

Descriptors frequencies (top 20):
Materialism: 752
Hedonism: 457
Aggression: 393
Narcissism: 349
Violence: 346
Ambition: 322
Recklessness: 316
Bravado: 302
Paranoia: 294
Defiance: 290
Rebellion: 287
Braggadocio: 265
Loyalty: 240
Nostalgia: 210
Swagger: 209
Regret: 194
Isolation: 190
Resilience: 186
Longing: 176
Betrayal: 171

Genres frequencies (top 20):
rap: 683
underground hip hop: 679
hip hop: 470
vapor trap: 363
trap: 232
dark trap: 229
plugg: 191
alternative hip hop: 190
pluggnb: 171
emo rap: 116
melodic rap: 115
: 104
psychedelic hip hop: 96
experimental hip hop: 84
atl hip hop: 81
southern hip hop: 80
glitchcore: 74
new orleans rap: 72
pop rap: 69
rock: 69

from collections import Counter
import random


# How many of the most frequent values to print per category.
N = 10

# Audio features reported for each cluster, in print order. Each one is read
# from the "<feature>_normalized" column of df_embeddings.
audio_features = ["danceability", "energy", "acousticness", "instrumentalness", "valence", "tempo"]

for i in range(25):
    cluster_df = df_embeddings[df_embeddings["cluster"] == i]

    # Blank tokens (from empty cells or trailing commas) are skipped so they
    # cannot show up as a nameless "top" descriptor or genre.
    descriptors_list = [
        value.strip()
        for row in cluster_df["descriptors"]
        for value in row.split(",")
        if value.strip()
    ]
    descriptors_count = Counter(descriptors_list)

    genres_list = [
        value.strip()
        for row in cluster_df["genres"]
        for value in row.split(",")
        if value.strip()
    ]
    genres_count = Counter(genres_list)

    print(f"\nCluster {i}:")
    print(f"Top {N} descriptors:")
    for descriptor, count in descriptors_count.most_common(N):
        print(f"- {descriptor}: {count}")

    print(f"Top {N} genres:")
    for genre, count in genres_count.most_common(N):
        print(f"- {genre}: {count}")

    # One pass per audio feature replaces six copy-pasted count/print stanzas.
    for feature in audio_features:
        value_counts = Counter(cluster_df[f"{feature}_normalized"])
        print(f"Top {N} {feature} values:")
        for value, count in value_counts.most_common(N):
            print(f"- {value}: {count}")

    print()

# Descriptors and genres that appear in almost every cluster and therefore do
# not help tell playlists apart.
# "Desperation" used to be listed with a trailing space, so it never matched
# the stripped tokens it is compared against and was silently kept.
to_remove = ["Power", "Desperation", "Despair", "Loneliness", "Arrogance", "Disillusionment", "Materialism", "Hedonism", "Aggression", "Narcissism", "Violence", "Recklessness", "Ambition", "Defiance", "Bravado", "Paranoia", "Rebellion", "Loyalty", "Braggadocio", "Swagger", "Nostalgia", "Betrayal", "Resilience", "Regret", "Isolation", "Longing", "rap", "underground hip hop", "hip hop", "vapor trap", "trap", "dark trap"]

# Set for constant-time membership tests; entries are stripped so stray
# whitespace in the list can never prevent a match again.
_to_remove_lookup = {term.strip() for term in to_remove}


def _drop_overused(cell):
    """Return the comma-separated *cell* without the tokens listed in ``to_remove``.

    Tokens are compared after stripping whitespace; the ones that are kept are
    re-joined with "," exactly as they appeared (original spacing preserved).
    """
    return ','.join(value for value in cell.split(",") if value.strip() not in _to_remove_lookup)


df_embeddings["descriptors"] = df_embeddings["descriptors"].apply(_drop_overused)

df_embeddings["genres"] = df_embeddings["genres"].apply(_drop_overused)

# Build one text blob per cluster: for up to 50 randomly sampled tracks,
# concatenate the track's remaining descriptors with its normalized audio features.
playlist_texts = []
for i in range(0, 25):
    cluster_df = df_embeddings[df_embeddings["cluster"] == i].copy()

    cluster_df['audio_features'] = cluster_df.apply(lambda row: f"danceability_normalized: {row['danceability_normalized']} energy_normalized: {row['energy_normalized']} acousticness_normalized: {row['acousticness_normalized']} instrumentalness_normalized: {row['instrumentalness_normalized']} valence_normalized: {row['valence_normalized']} tempo_normalized: {row['tempo_normalized']}", axis=1)

    # Cap the sample at the cluster size (the sample is unseeded, so it differs per run).
    n_samples = min(len(cluster_df), 50)
    sample_df = cluster_df.sample(n=n_samples)

    songs = [
        row["descriptors"] + " " + row["audio_features"]
        for _, row in sample_df.iterrows()
    ]

    playlist_text = " ".join(songs)

    # Append exactly once per cluster. The text used to be appended twice,
    # which doubled every downstream API call and shifted the
    # "playlist N" numbering away from the cluster index.
    playlist_texts.append(playlist_text)

import openai
import concurrent.futures
def create_descriptor(playlist_text, i):
    """Ask GPT-4 for a prose description of one playlist.

    Args:
        playlist_text: Concatenated descriptor / audio-feature text for the playlist.
        i: Index of the playlist; passed through unchanged so the caller can
            match the reply to its slot when results arrive out of order.

    Returns:
        Tuple ``(i, description)`` where ``description`` is the content of the
        first choice in the chat completion.
    """
    system_message = {"role": "system", "content": "You are a helpful and an artistically gifted, creative speaker, here to help with any questions I may have."}
    user_message = {"role": "user", "content": f"Take this conjoined block of lyrical descriptors, song genres, and audio descriptors for a specific playlist: {playlist_text}. Translate specific emotional sentiments, dominant audio features, and prominent genres of the playlist into a detailed, well-rounded description. Absolutely do not call it a mix, or eclectic, or a concotion, or any other mixture of things. Discover 1-2 specific patterns or features and write directly on those. Each playlist must be 100% unique in its identity, not a variety."}

    completion = openai.ChatCompletion.create(
        model='gpt-4',
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=1500,
    )
    first_choice = completion["choices"][0]
    return i, first_choice["message"]["content"]

# One slot per playlist text; filled by index because futures complete in
# arbitrary order.
descriptions = [None] * len(playlist_texts)

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(create_descriptor, playlist_text.strip(), i) for i, playlist_text in enumerate(playlist_texts)]
    for future in concurrent.futures.as_completed(futures):
        # future.result() re-raises any exception from the worker, aborting the cell.
        i, descriptor = future.result()
        descriptions[i] = descriptor

# Write as UTF-8 explicitly (as save_to_txt does): model output can contain
# non-ASCII characters that the platform default encoding may reject.
with open('playlist_descriptions.txt', 'w', encoding='utf-8') as f:
    for i, description in enumerate(descriptions):
        f.write(f"\n\nResponse for playlist {i+1}:\n{description}\n")

# Pool of 50 "<artist> in the style of <movement>" strings used to steer the
# generated art prompts. generate_art_prompt() draws one at random and removes
# it from this list, so the pool shrinks with every call and must be re-created
# (re-run this cell) before generating prompts again.
# NOTE(review): Frida Kahlo is listed twice with different style wording.
artists = [
    "Leonardo da Vinci in the style of Italian Renaissance",
    "Michelangelo Buonarroti in the style of Italian Renaissance",
    "Pablo Picasso in the style of Cubism and Surrealism",
    "Vincent van Gogh in the style of Post-Impressionism",
    "Rembrandt van Rijn in the style of Dutch Golden Age",
    "Salvador Dalí in the style of Surrealism",
    "Frida Kahlo in the style of Mexican Folk Art and Surrealism",
    "Georgia O'Keeffe in the style of American Modernism",
    "Jackson Pollock in the style of Abstract Expressionism",
    "Claude Monet in the style of Impressionism",
    "Henri Matisse in the style of Fauvism and Modernism",
    "Johannes Vermeer in the style of Dutch Golden Age",
    "Piet Mondrian in the style of De Stijl and Abstract Art",
    "Caravaggio in the style of Baroque",
    "Peter Paul Rubens in the style of Baroque",
    "Diego Velázquez in the style of Baroque",
    "Francisco Goya in the style of Romanticism",
    "Edouard Manet in the style of Realism and Impressionism",
    "Paul Cézanne in the style of Post-Impressionism",
    "Auguste Rodin in the style of Sculpture and Modernism",
    "Katsushika Hokusai in the style of Ukiyo-e and Japanese Art",
    "Titian in the style of Italian Renaissance",
    "Raphael Sanzio in the style of Italian Renaissance",
    "Sandro Botticelli in the style of Italian Renaissance",
    "Henri Cartier-Bresson in the style of Photography and Documentary",
    "Édgar Degas in the style of Impressionism",
    "Gustave Courbet in the style of Realism",
    "Frida Kahlo in the style of Surrealism and Mexican Art",
    "Marcel Duchamp in the style of Dada and Conceptual Art",
    "Marc Chagall in the style of Cubism and Expressionism",
    "Wassily Kandinsky in the style of Expressionism and Abstract Art",
    "Giotto di Bondone in the style of Italian Gothic",
    "Mary Cassatt in the style of Impressionism",
    "Egon Schiele in the style of Expressionism and Austrian Art",
    "Edward Hopper in the style of American Realism and American Scene Painting",
    "Lucian Freud in the style of Expressionism and Portraiture",
    "Artemisia Gentileschi in the style of Baroque",
    "Yayoi Kusama in the style of Pop Art and Contemporary Art",
    "Henri de Toulouse-Lautrec in the style of Post-Impressionism",
    "Max Ernst in the style of Dadaism, Surrealism, and Maximalism",
    "James Whistler in the style of American Art and Painting",
    "Kazimir Malevich in the style of Suprematism",
    "Willem de Kooning in the style of Abstract Expressionism and Dutch-American Art",
    "Cindy Sherman in the style of Photography and Conceptual Art",
    "Jan Van Eyck in the style of Flemish Renaissance",
    "Paul Klee in the style of Abstract Art, Modernism, and Expressionism",
    "Roy Lichtenstein in the style of Pop Art",
    "Hans Holbein the Younger in the style of German Renaissance",
    "Donatello in the style of Italian Renaissance Sculpture",
    "Josef Albers in the style of Abstract Art, Bauhaus, and Geometric Abstraction"
]

import openai
import concurrent.futures


import threading

# Serializes access to the shared ``artists`` pool. This function runs on
# several worker threads at once; without the lock two threads can draw the
# same artist, and the slower one then fails in ``artists.remove`` and loses
# its playlist.
_artists_lock = threading.Lock()


def generate_art_prompt(description):
    """Ask GPT-4 for a visual-art prompt matching a playlist description.

    A random artist is drawn from the module-level ``artists`` list and removed
    from it, so each successful call uses a different artist.

    Args:
        description: Prose description of the playlist.

    Returns:
        The stripped prompt text, or ``None`` if anything went wrong (no
        artists left, API failure, unexpected response); the error is printed.
    """
    try:
        # Draw and remove as one atomic step so no two threads share an artist.
        with _artists_lock:
            artist = random.choice(artists)
            artists.remove(artist)
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an eloquent musical bot trained to analyze the emotional sentiment and atmosphere of a playlist."},
                {"role": "user", "content": f"Based on this playlist description: {description}, Give a one paragraph prompt for a visual art piece that you feels represents its overarching emotional sentiment and themes. For the art, specify that it is done by {artist}. Then offer a subject matter to be created in their exact style. Be as specific and complete as possible with your description. Give only concrete visual elements (medium, technique, colors, subject matter, scene, etc.) that are not up for interpretation."}
            ],
            temperature=1,
            max_tokens=3000
        )
        art_prompt = response["choices"][0]["message"]["content"].strip()
        return art_prompt
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this playlist.
        print(f"Error generating response for playlist text: {description}")
        print(e)
        return None

def save_to_txt(art_prompts, file_name="art_prompts.txt"):
    """Write every prompt to *file_name* (UTF-8), each followed by a blank line.

    Args:
        art_prompts: Iterable of prompt strings.
        file_name: Destination path; an existing file is overwritten.
    """
    with open(file_name, "w", encoding="utf-8") as out_file:
        out_file.writelines(prompt + "\n\n" for prompt in art_prompts)

# Successfully generated prompts, in completion order (not playlist order).
art_prompts = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future back to the description it was submitted with, for error reporting.
    future_results = {executor.submit(generate_art_prompt, description): description for description in descriptions}

    for future in concurrent.futures.as_completed(future_results):
        input_description = future_results[future]
        try:
            art_prompt = future.result()
        except Exception as e:
            print(f"Error processing response for playlist text: {input_description}")
            print(e)
        else:
            # generate_art_prompt signals a handled failure with None; skip those.
            if art_prompt is not None:
                art_prompts.append(art_prompt)

save_to_txt(art_prompts)

import openai


def generate_images(prompts):
    """Generate one 1024x1024 image per prompt via the OpenAI image API.

    Each prompt and the URL of its generated image are printed as they arrive.

    Args:
        prompts: Iterable of text prompts.

    Returns:
        List of the generated image URLs, in the order they were produced.
        Earlier versions returned ``None`` and only printed the URLs, so they
        could not be reused programmatically.
    """
    urls = []
    for prompt in prompts:
        response = openai.Image.create(
            prompt=prompt,
            n=1,
            size="1024x1024"
        )
        for image_data in response['data']:
            image_url = image_data['url']
            print(f"Prompt: {prompt}\nImage URL: {image_url}\n")
            urls.append(image_url)
    return urls

# Generate an image for every collected art prompt; the URLs are printed as they arrive.
generate_images(art_prompts)

import os

# Export one CSV per cluster into ./cluster_csvs.
output_dir = "cluster_csvs"
# exist_ok avoids the separate exists() check (and the race it implies).
os.makedirs(output_dir, exist_ok=True)

for i in range(0, 25):
    # Work on an explicit copy, as the playlist-text cell does: assigning a
    # column onto a filtered slice triggers pandas' SettingWithCopy path.
    cluster_df = df_embeddings[df_embeddings["cluster"] == i].copy()
    # Index-aligned: each row picks up the track_id of the same index label in df.
    cluster_df["track_id"] = df["track_id"]
    cluster_csv_filename = f"cluster_{i}.csv"
    cluster_csv_filepath = os.path.join(output_dir, cluster_csv_filename)
    cluster_df.to_csv(cluster_csv_filepath, index=False)

import os
import pandas as pd
from tqdm import tqdm
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from pprint import pprint
from spotipy.client import SpotifyException
import base64



# OAuth settings come from the values loaded out of config.ini at the top of
# the notebook.
oauth_settings = {
    "client_id": CLIENT_ID,
    "client_secret": CLIENT_SECRET,
    "redirect_uri": REDIRECT_URI,
    "scope": SCOPE,
}
auth_manager = SpotifyOAuth(**oauth_settings)

# NOTE(review): both an auth manager and the raw access token obtained earlier
# are passed; confirm which one spotipy actually uses and whether the token is
# refreshed once it expires.
sp = spotipy.Spotify(auth_manager=auth_manager, auth=access_token)




# Create one private Spotify playlist per exported cluster CSV.
# NOTE(review): absolute, machine-specific path; the export cell writes to the
# relative "cluster_csvs" directory — confirm both point at the same place.
csv_directory = "/Users/gavinmason/RawC/Jupyter/Botify/cluster_csvs"

csv_files = [f for f in os.listdir(csv_directory) if f.endswith(".csv")]

for csv_file in csv_files:
    file_path = os.path.join(csv_directory, csv_file)
    # Read into a dedicated name. Rebinding `df` here replaced the notebook's
    # main DataFrame with the last cluster CSV, so re-running any earlier cell
    # that reads `df` silently worked on the wrong data.
    playlist_df = pd.read_csv(file_path)
    track_uris = playlist_df['track_id'].tolist()

    # Playlist name is the file name without its ".csv" suffix.
    playlist_name = f"T2 {csv_file[:-4]}"
    try:
        playlist = sp.user_playlist_create(user=sp.current_user()['id'], name=playlist_name, public=False)
        # Add tracks in batches of 100 (presumably the per-request limit of the
        # Spotify endpoint — verify against the API docs).
        for i in range(0, len(track_uris), 100):
            sp.playlist_add_items(playlist_id=playlist['id'], items=track_uris[i:i+100])
    except SpotifyException as e:
        # Report and move on to the next CSV; a partially filled playlist may remain.
        print(f"Error creating playlist: {e}")
        continue
    print(f"Playlist '{playlist_name}' created with {len(track_uris)} tracks.")

Playlist 'T2 cluster_9' created with 105 tracks.
Playlist 'T2 cluster_8' created with 50 tracks.
Playlist 'T2 cluster_20' created with 85 tracks.
Playlist 'T2 cluster_21' created with 29 tracks.
Playlist 'T2 cluster_23' created with 44 tracks.
Playlist 'T2 cluster_22' created with 58 tracks.
Playlist 'T2 cluster_19' created with 118 tracks.
Playlist 'T2 cluster_18' created with 66 tracks.
Playlist 'T2 cluster_24' created with 27 tracks.
Playlist 'T2 cluster_15' created with 12 tracks.
Playlist 'T2 cluster_14' created with 33 tracks.
Playlist 'T2 cluster_16' created with 74 tracks.
Playlist 'T2 cluster_17' created with 41 tracks.
Playlist 'T2 cluster_13' created with 73 tracks.
Playlist 'T2 cluster_12' created with 27 tracks.
Playlist 'T2 cluster_10' created with 129 tracks.
Playlist 'T2 cluster_11' created with 55 tracks.
Playlist 'T2 cluster_5' created with 118 tracks.
Playlist 'T2 cluster_4' created with 43 tracks.
Playlist 'T2 cluster_6' created with 107 tracks.
Playlist 'T2 cluster_7' created with 70 tracks.
Playlist 'T2 cluster_3' created with 104 tracks.
Playlist 'T2 cluster_2' created with 143 tracks.
Playlist 'T2 cluster_0' created with 39 tracks.
Playlist 'T2 cluster_1' created with 100 tracks.

import cloudinary
import cloudinary.api
import cloudinary.uploader

# Configure the Cloudinary client, forcing HTTPS URLs (secure=True).
# NOTE(review): the credential fields are empty strings here — presumably
# redacted; they must be filled in (ideally from config, not hard-coded).
cloudinary.config(
    cloud_name="",
    api_key="",
    api_secret="",
    secure=True,
)

<cloudinary.Config at 0x7fb43ebd1b70>

def save_images_to_html(image_urls, output_filename="images.html"):
    """Write a minimal HTML gallery with one ``<img>`` tag per entry.

    Every entry of *image_urls* is used as an image source prefixed with
    ``../``. The page is written to *output_filename* (overwriting it) and a
    confirmation line is printed.

    Args:
        image_urls: Iterable of image locations.
        output_filename: Path of the HTML file to create.
    """
    page_head = "<!DOCTYPE html>\n<html>\n<head>\n<title>Images</title>\n</head>\n<body>\n"
    image_tags = [
        f'<img src="../{image_url}" alt="Image" style="width: 300px; height: auto; padding: 10px;">\n'
        for image_url in image_urls
    ]
    page_tail = "</body>\n</html>"
    document = page_head + "".join(image_tags) + page_tail

    with open(output_filename, "w") as out_file:
        out_file.write(document)

    print(f"HTML file with images saved as {output_filename}")

# Write the gallery page for the collected images.
# NOTE(review): `image_urls` is not assigned in the nearby cells — confirm where it is defined.
save_images_to_html(image_urls)

HTML file with images saved as images.html

from IPython.display import display, Image
def display_images(image_urls):
    """Show each image from ``image_urls`` inline, 600 pixels wide.

    Images are wrapped in ``IPython.display.Image`` objects one at a time
    and handed to ``display`` in the order they appear in ``image_urls``.
    """
    # Lazy generator: each Image is created right before it is displayed.
    pictures = (Image(url=link, width=600) for link in image_urls)
    for picture in pictures:
        display(picture)
# Render every image inline in the notebook output.
# NOTE(review): image_urls is not defined in this part of the notebook --
# presumably produced by an earlier cell; confirm before running in isolation.
display_images(image_urls)

	title	release_date	artists	album	popularity	genres	duration	explicit	track_id	artist_ids	...	tempo	speechiness	acousticness	instrumentalness	time_signature	danceability	key	loudness	valence	mode
1243	Dmtri	2019-11-22	Action Bronson, The Alchemist	Lamb Over Rice	53	NaN	274133	True	1CEVg3GpmapQ5rK5feHbFc	[7BMccF0hQFBpP6417k1OtQ, 0eVyjRhzZKke2KFYTcDkeu]	...	81.024	0.478	0.6740	0.000000	4	0.599	9	-6.290	0.300	0
1735	buku	2023-03-10	wolfacejoeyy	buku	59	NaN	137691	True	7eOAf1CKZenIfLyaV01MgX	[7LjXznzJeKuvjRbNcBWd6Z]	...	139.984	0.209	0.8130	0.001450	4	0.645	1	-8.633	0.534	1
183	Evil Fantasy	2018-01-08	Freddie Dredd	Evil Fantasy	58	NaN	114128	True	3A4TIpLeMng0Nn0N7yre4g	[0dlDsD7y6ccmDm8tuWCU6F]	...	80.028	0.316	0.2200	0.000321	4	0.855	5	-5.976	0.655	1
313	BIG	2019-09-27	Young M.A	Herstory in the Making	64	NaN	225919	True	7cyeGclH71MayOuQ7qtLFl	[7LvoDJUNGnOrPdGRzVtOJ9]	...	85.005	0.369	0.0408	0.000006	4	0.860	0	-5.776	0.150	0
1475	Mob Shit	2022-08-04	Warhol.SS	3PEAT - EP	24	NaN	142368	True	21RvkkYHi4ApQeWCaR2J0n	[3O7mUqVuHmLJqHXbFiQY62]	...	148.043	0.296	0.0684	0.000000	4	0.712	11	-4.538	0.830	0

	title	release_date	artists	album	popularity	genres	duration	explicit	track_id	artist_ids	...	tempo	speechiness	acousticness	instrumentalness	time_signature	danceability	key	loudness	valence	mode
1519	Crime Story	1999-06-22	Shyheim	Manchild	14	hardcore hip hop, wu fam	281600	True	4EJJKWY89WxPhqDddB5iA0	[0rqb5s3Cxj0lGr6jTrWc0P]	...	88.051	0.3190	0.5330	0.0000	4	0.570	7	-6.016	0.743	1
1157	Play Em Like Atari	2022-02-10	Drain Gang Archive	Play Em Like Atari	49	drain	229353	False	34uzB2MBbxtrXMhYPUfz2g	[1NZYxkl40ebW2QiwkMctIY]	...	120.020	0.0567	0.5320	0.0029	4	0.450	7	-9.794	0.111	1
1532	Paranormal Snaptivity	2021-07-20	Zelooperz, RX Nephew, Quadie Diesel	Van Goghs Left Ear	30	alternative hip hop, detroit hip hop, indie hi...	163747	True	67dAJ7uV2yPgElWGztOjJF	[5j93hwFBNo29RJMsWvtzj8, 7f3pexehhZwFV05N3csJY...	...	169.143	0.3350	0.0169	0.0000	4	0.584	1	-6.183	0.177	0
1509	Last Dayz (S.i.p Willie & Bogle)	2021-08-17	Sickboyrari	Chain Gang Halo World	30	glitchcore, underground hip hop, vapor trap	143404	True	4TGKToBI1GE3AH07TVixZt	[395BAMokcNaqWcCl8uU1ky]	...	79.201	0.2850	0.2960	0.0018	4	0.444	6	-10.337	0.863	1
1705	Oh Lord	2022-02-02	iayze	The Leek 4!	43	plugg	89216	True	536W1SryvhPjW4YjxVg9zA	[00msLVGU9crX0EC5McCiCa]	...	140.132	0.0942	0.3640	0.0000	4	0.575	7	-10.456	0.342	0

	title	release_date	artists	album	popularity	genres	duration	explicit	track_id	artist_ids	...	instrumentalness	time_signature	danceability	key	loudness	valence	mode	lyrics	tokens	descriptors
0	O Pana!	2016-09-11	$uicideboy$	Eternal Grey	0	dark trap, new orleans rap, underground hip hop	128417	True	2I12vOWeJU5ayhr6ha6esf	['1VPmR4DJC1PlOtd0IADAO0']	...	0.000021	4	0.800	0	-6.540	0.179	1	crush it up hit the blunt hit the blunt crush ...	413	Sensuality, Confidence, Intoxication, Infatuat...
1	Runnin' Thru The 7th With My Woadies	2015-09-01	$uicideboy$, Pouya	$outh $ide $uicide	0	dark trap, new orleans rap, underground hip ho...	205766	True	70nmZhHZLNVYWP4NON41Zw	['1VPmR4DJC1PlOtd0IADAO0', '4nXOZlYoAD67hF9aUE...	...	0.000241	4	0.526	8	-7.244	0.187	0	when you come home ill stop this when you come...	529	Desperation, Paranoia, Resentment, Self-Doubt,...
2	ELEMENT.	2017-04-14	Kendrick Lamar	DAMN.	72	conscious hip hop, hip hop, rap, west coast rap	208733	True	1EaKU4dMbesXXd3BrLCtYG	['2YZyLoL8N0Wb9xBt1NhZWg']	...	0.000000	4	0.748	4	-4.547	0.483	0	kenny aint nobody prayin for me yall know what...	776	Darkness, Rebellion, Death, Isolation, Misanth...
3	Boredom (feat. Rex Orange County & Anna of the...	2017-07-21	Tyler, The Creator, Rex Orange County, Anna of...	Flower Boy	71	hip hop, rap, bedroom pop, pop, electropop, in...	320720	True	5WNYg3usc6H8N3MBEp4zVk	['4V8LLVI7PbaPR0K2TGSxFF', '7pbDxGE6nQSZVfiFdq...	...	0.000522	4	0.559	8	-9.540	0.239	1	youre having fun fun time flies time flies tim...	600	Violence, Aggression, Self-Destruction, Narcis...
4	White Ferrari	2016-08-20	Frank Ocean	Blonde	77	hip hop, lgbtq+ hip hop, neo soul	248807	False	2LMkwUfqC6S6s6qDVlEuzV	['2h93pZq0e7k5yf4dywlkpM']	...	0.000000	4	0.501	0	-15.605	0.204	1	rides mind on the road your dilated eyes watch...	280	Desperation, Longing, Obsession, Paranoia, Reg...

	popularity	energy	liveness	tempo	speechiness	acousticness	instrumentalness	time_signature	danceability	key	loudness	valence	mode
count	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000	1750.000000
mean	39.984000	0.599483	0.206333	123.797894	0.179418	0.241785	0.073885	3.954857	0.654890	5.195429	-8.215059	0.420078	0.617714
std	25.426781	0.180084	0.163178	28.933340	0.143432	0.267291	0.208976	0.347690	0.161264	3.740218	3.332309	0.231564	0.486085
min	0.000000	0.022300	0.028000	0.000000	0.000000	0.000003	0.000000	0.000000	0.000000	0.000000	-26.134000	0.000000	0.000000
25%	19.000000	0.474250	0.106000	100.034250	0.055100	0.027025	0.000000	4.000000	0.554000	1.000000	-9.917250	0.229000	0.000000
50%	46.000000	0.599000	0.134000	127.017000	0.138000	0.125500	0.000006	4.000000	0.677000	5.000000	-7.736500	0.401000	1.000000
75%	60.000000	0.731000	0.252000	142.951250	0.277000	0.376750	0.002260	4.000000	0.772000	9.000000	-5.971000	0.585500	1.000000
max	97.000000	0.999000	0.980000	210.164000	0.915000	0.993000	0.960000	5.000000	0.985000	11.000000	0.915000	0.982000	1.000000

Botify - Spotify Playlist Processing and Visualization¶

Spotify API Access Token Retrieval and Spotipy¶

Using Spotipy Library for Spotify Web API Interaction¶

Retrieving Playlist Tracks and Extracting Data¶

Fetching Audio Features and Extending DataFrame¶

Fetching Genres Associated with Each Track¶

Fetching Lyrics for Tracks Using Genius and MusixMatch APIs¶

Song Lyrics Searching and Validation Process¶

Song Lyrics Searching and Validation Process (Continued)¶

1. Create a convenience filter/search method¶

2. Use Google search for manual input of lyrics¶

Preprocessing Song Lyrics for AI Analysis¶

Generating Descriptors for Song Lyrics with OpenAI GPT-3.5 Turbo¶

Data Exploration and Visualization for Feature Selection¶

Scaling and Normalizing Data with StandardScaler¶

Analyzing Feature Importance with RandomForestClassifier¶

Assessing Multicollinearity with Variance Inflation Factor (VIF)¶

Feature Selection for Analysis¶

Analyzing Histograms and Kernel Density Plots¶

Insights from the Plots¶

Tackling Skewness with Transformations¶

Transformations:¶

Checking Out the Results¶

Trying Log Transformation and Binning on Instrumentalness Feature¶

Log Transformation¶

Binning¶

Applying Quantile Transformation¶

Results of Quantile Transformation¶

Results of Quantile Transformation¶

'Number to Text' Translator for Standardized Features¶

Converting Features to Text Embeddings¶

Embedding Process¶

Importance of Vector Length¶

Dividing Features into Three Embedding Sets¶

Embedding Set Statistical Testing¶

Comparing Embedding Sets with K-means Clustering and Silhouette Score¶

Silhouette Score¶

Initial Scores and Insights¶

Combining Embeddings with Custom Weights¶

Steps to combine the embeddings with custom weights:¶

Testing Clustering Methods and Dimensionality Reduction Techniques¶

Clustering Method: K-means¶

Dimensionality Reduction Techniques¶

Techniques:¶

Additional Clustering Evaluation Metrics: DBI and CHI¶

Davies-Bouldin Index (DBI)¶

Calinski-Harabasz Index (CHI)¶

Scatter Matrices and CHI¶

Hierarchical Clustering: Agglomerative Clustering¶

Agglomerative Clustering Algorithm¶

Components of Agglomerative Clustering¶

Exploring Spectral Clustering¶

Visualization of Clusters¶

Winner: K-Means with UMAP Dimensionality Reduction¶

The Winning Method Revisualized¶

Analyzing and Exploring Clustering Results¶

Calculating Frequencies of Embedded Factors¶

Reducing Frequency for Creativity¶

Removing Dominant Data¶

Generating Playlist Descriptions with AI¶

Steps to generate playlist descriptions:¶

Generating Refined Descriptions with AI and Artist Styles¶

Generating Custom Spotify Cover Art with DALL-E¶

Saving Playlists to CSV and Creating Playlists on Spotify¶

Playlists Made!!!!¶

They can all be viewed publicly here:¶

Below, I display larger versions of all the AI-generated art created throughout this process. About half of the images were not used, either because of personal preference or because they came from trial and error with my system.¶

That Concludes Botify 1.0. Botify Grime (2.0) Is in Development.¶

Stay Tuned¶