# voxceleb.py 2.93 KB
import argparse
from utils import SubCommandRunner

def utt2spk(features: str, outfile: str):
    """Generate a utt2spk file from a feature file of voxceleb.
    (it also works with list files instead of features)

    The first whitespace-separated field of each input line is taken as the
    utterance id, and the part of that id before the first "-" as the
    speaker id. One "<utterance-id> <speaker-id>" line is written per
    non-blank input line, in input order.

    Args:
        features (str): features file (or list)
        outfile (str): output file to store the utt2spk
    """
    with open(features, "r") as f, open(outfile, "w") as out:
        for line in f:
            # split() with no separator tolerates tabs, repeated spaces and
            # leading/trailing whitespace; only the first field is needed.
            fields = line.split(None, 1)
            if not fields:
                # Blank line: there is no id, and writing anything would
                # produce a malformed " " entry in the output.
                continue
            utt_id = fields[0]
            spk = utt_id.split("-", 1)[0]
            out.write(utt_id + " " + spk + "\n")


def spk2utt(features: str, outfile: str):
    """Generate a spk2utt file from a feature file of voxceleb.
    (it also works with list files instead of features)

    The first whitespace-separated field of each input line is taken as the
    utterance id, and the part of that id before the first "-" as the
    speaker id. Utterances are grouped per speaker and one
    "<speaker-id> <utt-id> <utt-id> ..." line is written per speaker, with
    utterances in input order. Blank input lines are ignored.

    Args:
        features (str): features file (or list)
        outfile (str): output file to store the spk2utt
    """
    with open(features, "r") as f, open(outfile, "w") as out:
        spk2utt_dict = {}
        for line in f:
            # split() with no separator tolerates tabs, repeated spaces and
            # leading/trailing whitespace; only the first field is needed.
            fields = line.split(None, 1)
            if not fields:
                # Blank line: skipping it avoids creating an empty-named
                # speaker with an empty utterance id.
                continue
            utt_id = fields[0]
            spk = utt_id.split("-", 1)[0]
            spk2utt_dict.setdefault(spk, []).append(utt_id)

        # Speakers are emitted in the dictionary's iteration order.
        for spk, ids in spk2utt_dict.items():
            out.write(spk + " " + " ".join(ids) + "\n")

def wavscp(datadir: str, outfile: str):
    """Generate a wav.scp file (not implemented yet).

    Args:
        datadir (str): data directory
        outfile (str): output file to store the wav.scp

    Raises:
        NotImplementedError: always, since this command is still under
            construction.
    """
    # NotImplementedError is a subclass of Exception, so callers that catch
    # Exception keep working.
    raise NotImplementedError("Under construction")

if __name__ == "__main__":
    # Command-line entry point: one sub-command per generator function.
    # Each sub-parser stores its own name under "which" so the matching
    # function can be looked up after parsing.

    # Main parser
    parser = argparse.ArgumentParser(description="Voxceleb data management")
    subparsers = parser.add_subparsers(title="action")

    # utt2spk
    parser_utt2spk = subparsers.add_parser("utt2spk", help="Generate utt2spk file from feature file (works with list).")
    parser_utt2spk.add_argument("--features", required=True, help="Features file (works with list)")
    parser_utt2spk.add_argument("--outfile", default="utt2spk", help="output file")
    parser_utt2spk.set_defaults(which="utt2spk")

    # spk2utt
    parser_spk2utt = subparsers.add_parser("spk2utt", help="Generate spk2utt file from feature file (works with list).")
    parser_spk2utt.add_argument("--features", required=True, help="Features file (works with list)")
    parser_spk2utt.add_argument("--outfile", default="spk2utt", help="output file")
    parser_spk2utt.set_defaults(which="spk2utt")

    # wavscp
    # Must be registered on `subparsers`, the object created above.
    parser_wavscp = subparsers.add_parser("wavscp", help="generate wav scp file")
    parser_wavscp.add_argument("--datadir", required=True, help="data directory of voxceleb")
    parser_wavscp.add_argument("--outfile", default="wav.scp", help="wav.scp output file")
    parser_wavscp.set_defaults(which="wavscp")

    # Parse
    args = parser.parse_args()

    # Without a sub-command no "which" default is set; report a usage error
    # instead of failing later with an AttributeError.
    if not hasattr(args, "which"):
        parser.error("an action is required: utt2spk, spk2utt or wavscp")

    # Run commands
    runner = SubCommandRunner({
        "utt2spk": utt2spk,
        "spk2utt": spk2utt,
        "wavscp": wavscp,
    })

    # NOTE(review): remove="which" presumably drops the dispatch key before
    # the remaining arguments reach the handler; confirm against
    # utils.SubCommandRunner.
    runner.run(args.which, args.__dict__, remove="which")