The model consists of a music encoder. This checkpoint of MusiLingo is developed on the MusicQA dataset and can answer instructions with music raw audio, such as querying about the tempo, emotion, genre, tags or subjective feelings etc.
You can use the MusicQA dataset for the following demo. For the implementation of MusicQA, please refer to our Github repo. If you find the work useful for your research, please consider citing it using the following BibTeX entry: Model Card for Model ID
Model Details
Model Description
MERT-v1-300M
, a natural language decoder vicuna-7b-delta-v0
, and a linear projection layer between the two. Model Sources [optional]
Getting Started
# Third-party dependencies for the MusicQA evaluation demo.
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import StoppingCriteria, StoppingCriteriaList, Wav2Vec2FeatureExtractor
class StoppingCriteriaSub(StoppingCriteria):
    """Stopping criterion: halt generation once the tail of the generated
    sequence equals any of the configured stop-token sequences.

    NOTE(review): only batch element 0 is inspected, so this assumes
    generation with batch_size == 1 — confirm against the caller.
    """

    def __init__(self, stops=None, encounters=1):
        # `stops=[]` in the original was a mutable default argument; use a
        # None sentinel instead (behavior-identical for all callers).
        # `encounters` is accepted for interface compatibility but unused.
        super().__init__()
        self.stops = stops if stops is not None else []

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        # Return True when the last len(stop) generated ids match one of
        # the stop sequences; otherwise generation continues.
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True
        return False
def answer(self, samples, stopping, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.5,
           repetition_penalty=1.0, length_penalty=1, temperature=0.1, max_length=2000):
    """Generate a text answer for a batch of audio (+ optional instruction) samples.

    Args:
        self: a MusiLingo model exposing `encode_audio`, `prompt_template`,
            `instruction_prompt_wrap`, `llama_tokenizer` and `llama_model`
            (passed explicitly — this function is used as an unbound method).
        samples: dict with key "audio" (tensor batch) and optionally
            "instruction_input" (list of instruction strings).
        stopping: a transformers StoppingCriteriaList for generation.
        max_new_tokens, num_beams, min_length, top_p, repetition_penalty,
            length_penalty, temperature: forwarded to `generate`.
        max_length: accepted for interface compatibility; not used here.

    Returns:
        The decoded answer string, stripped of the '###' stop sign and the
        'Assistant:' role prefix.
    """
    audio = samples["audio"].cuda()
    audio_embeds, atts_audio = self.encode_audio(audio)
    if 'instruction_input' in samples:  # instruction dataset
        # Wrap each instruction into the model's prompt template around the
        # audio placeholder tokens.
        instruction_prompt = []
        for instruction in samples['instruction_input']:
            prompt = '<Audio><AudioHere></Audio> ' + instruction
            instruction_prompt.append(self.prompt_template.format(prompt))
        audio_embeds, atts_audio = self.instruction_prompt_wrap(audio_embeds, atts_audio, instruction_prompt)
    self.llama_tokenizer.padding_side = "right"
    batch_size = audio_embeds.shape[0]
    # Prepend a BOS token embedding to the audio embeddings.
    bos = torch.ones([batch_size, 1],
                     dtype=torch.long,
                     device=torch.device('cuda')) * self.llama_tokenizer.bos_token_id
    bos_embeds = self.llama_model.model.embed_tokens(bos)
    atts_bos = atts_audio[:, :1]
    inputs_embeds = torch.cat([bos_embeds, audio_embeds], dim=1)
    # NOTE(review): attention_mask is built but never passed to generate()
    # — preserved as-is from the original; confirm whether it should be.
    attention_mask = torch.cat([atts_bos, atts_audio], dim=1)
    outputs = self.llama_model.generate(
        inputs_embeds=inputs_embeds,
        max_new_tokens=max_new_tokens,
        stopping_criteria=stopping,
        num_beams=num_beams,
        do_sample=True,
        min_length=min_length,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        length_penalty=length_penalty,
        temperature=temperature,
    )
    output_token = outputs[0]
    if output_token[0] == 0:  # the model might output an unknown token <unk> at the beginning. remove it
        output_token = output_token[1:]
    if output_token[0] == 1:  # if there is a start token <s> at the beginning. remove it
        output_token = output_token[1:]
    output_text = self.llama_tokenizer.decode(output_token, add_special_tokens=False)
    output_text = output_text.split('###')[0]  # remove the stop sign '###'
    output_text = output_text.split('Assistant:')[-1].strip()
    return output_text
# Build the MERT feature extractor, the MusicQA evaluation dataset/loader,
# and the stopping criteria ('###' and '##' token-id sequences for Vicuna).
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-330M", trust_remote_code=True)
ds = MusicQADataset(processor, f'{path}/data/music_data', 'Eval')
dl = DataLoader(
    ds,
    batch_size=1,
    num_workers=0,
    pin_memory=True,
    shuffle=False,
    drop_last=True,
    collate_fn=ds.collater
)
stopping = StoppingCriteriaList([StoppingCriteriaSub([torch.tensor([835]).cuda(),
                                                      torch.tensor([2277, 29937]).cuda()])])
from transformers import AutoModel

# Load the MusiLingo MusicQA checkpoint and run the evaluation loop,
# printing each question alongside the generated answer.
model_musicqa = AutoModel.from_pretrained("m-a-p/MusiLingo-musicqa-v1")
for idx, sample in tqdm(enumerate(dl)):
    # Fix: the original referenced an undefined `Musiligo_musicqa`; the
    # checkpoint loaded above is `model_musicqa`.
    ans = answer(model_musicqa.model, sample, stopping, length_penalty=100, temperature=0.1)
    txt = sample['text_input'][0]
    print(txt)
    print(ans)
Citing This Work
@inproceedings{deng2024musilingo,
title={MusiLingo: Bridging Music and Text with Pre-trained Language Models for Music Captioning and Query Response},
author={Deng, Zihao and Ma, Yinghao and Liu, Yudong and Guo, Rongchen and Zhang, Ge and Chen, Wenhu and Huang, Wenhao and Benetos, Emmanouil},
booktitle={Proceedings of the 2024 Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL 2024)},
year={2024},
organization={Association for Computational Linguistics}
}
点击空白处退出提示
评论