// marytts.htsengine.HTSVocoder Maven / Gradle / Ivy
/* ----------------------------------------------------------------- */
/* The HMM-Based Speech Synthesis Engine "hts_engine API" */
/* developed by HTS Working Group */
/* http://hts-engine.sourceforge.net/ */
/* ----------------------------------------------------------------- */
/* */
/* Copyright (c) 2001-2010 Nagoya Institute of Technology */
/* Department of Computer Science */
/* */
/* 2001-2008 Tokyo Institute of Technology */
/* Interdisciplinary Graduate School of */
/* Science and Engineering */
/* */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* - Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* - Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials provided */
/* with the distribution. */
/* - Neither the name of the HTS working group nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* ----------------------------------------------------------------- */
/**
* Copyright 2011 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.htsengine;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import marytts.signalproc.process.AmplitudeNormalizer;
import marytts.util.MaryUtils;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.ProducingDoubleDataSource;
import marytts.util.data.audio.AudioPlayer;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.io.LEDataInputStream;
import marytts.util.math.FFT;
import marytts.util.math.MathUtils;
import marytts.htsengine.HMMData;
import org.apache.log4j.Logger;
/**
* Synthesis of speech out of speech parameters. Mixed excitation MLSA vocoder.
*
* Java port and extension of HTS engine API version 1.04 Extension: mixed excitation
*
* @author Marcela Charfuelan
*/
public class HTSVocoder {
/* ---- Vocoder constants ---- */
public static final int IPERIOD = 1; /* interpolation period */
public static final int SEED = 1; /* seed passed to java.util.Random in initVocoder */
public static final int PADEORDER = 5; /* pade order for MLSA filter */
public static final int IRLENG = 96; /* length of impulse response */
public static final double ZERO = 1.0e-10; /* ~(0) */
public static final double LZERO = (-1.0e+10); /* ~log(0) */
/* ppade is a copy of pade in mlsadf() function : ppade = &( pade[pd*(pd+1)/2] ); */
/* Pade coefficient table, laid out as consecutive rows of length 1, 2, ..., 6 (21 values in total). */
static final double[] pade = new double[] { /* used in mlsadf */
1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0.4999273, 0.1067005, 0.01170221, 0.0005656279, 1, 0.4999391, 0.1107098, 0.01369984,
0.0009564853, 0.00003041721 };
/* Index of the first coefficient of the row used for PADEORDER; read as pade[ppade + i] in mlsadf1/mlsadf2. */
static final int ppade = PADEORDER * (PADEORDER + 1) / 2; /* offset for vector pade */;
private static final Logger logger = MaryUtils.getLogger("Vocoder");
/* ---- Per-instance state, (re)initialised by initVocoder ---- */
private Random rand; /* source of the +/-1 noise returned by uniformRand() */
private int stage; /* Gamma=-1/stage : if stage=0 then Gamma=0 */
private double gamma; /* Gamma */
private boolean use_log_gain; /* log gain flag (for LSP) */
private int fprd; /* frame shift */
private double p1; /* used in excitation generation */
private double pc; /* used in excitation generation */
private double C[]; /* used in the MLSA/MGLSA filter */
private double CC[]; /* used in the MLSA/MGLSA filter */
private double CINC[]; /* used in the MLSA/MGLSA filter */
private double D1[]; /* used in the MLSA/MGLSA filter */
private double rate; /* read from HMMData.getRate(); presumably the sampling rate in Hz -- TODO confirm */
/* Offsets into D1, computed in initVocoder for the stage == 0 (MGC) case only. */
int pt2; /* used in mlsadf2 */
private final int pt3[] = new int[PADEORDER + 1]; /* used in mlsadf2 */
/* mixed excitation variables */
private int numM; /* Number of bandpass filters for mixed excitation */
private int orderM; /* Order of filters for mixed excitation */
private double h[][]; /* filters for mixed excitation */
private double xpulseSignal[]; /* the size of this should be orderM */
private double xnoiseSignal[]; /* the size of this should be orderM */
private boolean mixedExcitation = false;
private boolean fourierMagnitudes = false;
/**
 * Prepares this vocoder for a synthesis run: copies the synthesis settings out of {@code htsData}, reseeds the noise
 * generator with {@link #SEED}, allocates the filter coefficient and delay buffers, and resets the excitation state.
 * Call it once the dimensions of the mel-cepstral stream are known.
 *
 * @param mcep_order
 *            number of spectral coefficients per frame; size of the C, CC and CINC buffers
 * @param mcep_vsize
 *            vector size of the spectral stream; used only to size the filter delay buffer D1
 * @param htsData
 *            configuration providing stage, gamma, log-gain flag, frame period and rate
 */
private void initVocoder(int mcep_order, int mcep_vsize, HMMData htsData) {
    /* settings taken over from the voice configuration */
    stage = htsData.getStage();
    gamma = htsData.getGamma();
    use_log_gain = htsData.getUseLogGain();
    fprd = htsData.getFperiod();
    rate = htsData.getRate();
    rand = new Random(SEED);

    /* current, target and per-sample increment of the filter coefficients */
    C = new double[mcep_order];
    CC = new double[mcep_order];
    CINC = new double[mcep_order];

    final int tripleOrder = 3 * (mcep_order);
    if (stage != 0) {
        /* LSP: delay buffer for the MGLSA filter */
        D1 = new double[((mcep_vsize + 1) * (stage + 3)) - tripleOrder];
    } else {
        /* MGC: delay buffer for the MLSA filter plus the offsets mlsadf2 works with */
        D1 = new double[(mcep_vsize * (3 + PADEORDER) + 5 * PADEORDER + 6) - tripleOrder];
        final int firstSection = 2 * (PADEORDER + 1);
        final int sectionLength = mcep_order + 1;
        pt2 = firstSection + (PADEORDER * sectionLength);
        for (int section = 1; section <= PADEORDER; section++) {
            pt3[section] = firstSection + ((section - 1) * sectionLength);
        }
    }

    /* excitation state: p1 < 0 marks "first frame not processed yet" */
    p1 = -1;
    pc = 0.0;
} /* method initVocoder */
/**
* HTS_MLSA_Vocoder: synthesis of speech out of mel-cepstral coefficients. This procedure uses the parameters generated in
* pdf2par, stored in: PStream mceppst (mel-cepstral coefficients), PStream strpst (filter bank strengths for mixed
* excitation), PStream magpst (Fourier magnitudes) and PStream lf0pst (log F0).
*
* @param pdf2par
* pdf2par
* @param htsData
* htsData
* @throws Exception
* Exception
* @return DDSAudioInputStream
*/
public AudioInputStream htsMLSAVocoder(HTSParameterGeneration pdf2par, HMMData htsData) throws Exception {
int audioSize = computeAudioSize(pdf2par.getMcepPst(), htsData);
HTSVocoderDataProducer producer = new HTSVocoderDataProducer(audioSize, pdf2par, htsData);
producer.start();
return new DDSAudioInputStream(producer, getHTSAudioFormat(htsData));
/*
* double [] audio_double = null;
*
* audio_double = htsMLSAVocoder(pdf2par.getlf0Pst(), pdf2par.getMcepPst(), pdf2par.getStrPst(), pdf2par.getMagPst(),
* pdf2par.getVoicedArray(), htsData);
*
* long lengthInSamples = (audio_double.length * 2 ) / (sampleSizeInBits/8); logger.debug("length in samples=" +
* lengthInSamples );
*
* // Normalise the signal before return, this will normalise between 1 and -1 double MaxSample =
* MathUtils.getAbsMax(audio_double); for (int i=0; i automatically initialized to 0.0
h = htsData.getMixFilters();
hp = new double[orderM];
hn = new double[orderM];
// Check if the number of filters is equal to the order of strpst
// i.e. the number of filters is equal to the number of generated strengths per frame.
if (numM != strPst.getOrder()) {
logger.debug("htsMLSAVocoder: error num mix-excitation filters =" + numM
+ " in configuration file is different from generated str order=" + strPst.getOrder());
throw new Exception("htsMLSAVocoder: error num mix-excitation filters = " + numM
+ " in configuration file is different from generated str order=" + strPst.getOrder());
}
logger.debug("HMM speech generation with mixed-excitation.");
} else
logger.debug("HMM speech generation without mixed-excitation.");
if (fourierMagnitudes && htsData.getPdfMagStream() != null)
logger.debug("Pulse generated with Fourier Magnitudes.");
// else
// logger.info("Pulse generated as a unit pulse.");
if (beta != 0.0)
logger.debug("Postfiltering applied with beta=" + beta);
else
logger.debug("No postfiltering applied.");
f0Std = htsData.getF0Std();
f0Shift = htsData.getF0Mean();
f0MeanOri = 0.0;
for (mcepframe = 0, lf0frame = 0; mcepframe < mcepPst.getT(); mcepframe++) {
if (voiced[mcepframe]) {
f0MeanOri = f0MeanOri + Math.exp(lf0Pst.getPar(lf0frame, 0));
// System.out.println("voiced t=" + mcepframe + " " + lf0Pst.getPar(lf0frame, 0) + " ");
lf0frame++;
}
// else
// System.out.println("unvoiced t=" + mcepframe + " 0.0 ");
}
f0MeanOri = f0MeanOri / lf0frame;
/* _______________________Synthesize speech waveforms_____________________ */
/* generate Nperiod samples per mcepframe */
int s = 0; /* number of samples */
int s_double = 0;
int audio_size = computeAudioSize(mcepPst, htsData); /* audio size in samples, calculated as num frames * frame period */
double[] audio_double = new double[audio_size]; /* initialise buffer for audio */
magSample = 1;
magPulseSize = 0;
for (mcepframe = 0, lf0frame = 0; mcepframe < mcepPst.getT(); mcepframe++) { /* for each mcep frame */
/** feature vector for a particular frame */
double mc[] = new double[m]; /* feature vector for a particular frame */
/* get current feature vector mgc */
for (int i = 0; i < m; i++)
mc[i] = mcepPst.getPar(mcepframe, i);
/* f0 modification through the MARY audio effects */
double f0 = 0.0;
if (voiced[mcepframe]) {
f0 = f0Std * Math.exp(lf0Pst.getPar(lf0frame, 0)) + (1 - f0Std) * f0MeanOri + f0Shift;
lf0frame++;
f0 = Math.max(0.0, f0);
}
/*
* if mixed excitation get shaping filters for this frame the strength of pulse, is taken from the predicted value,
* which can be maximum 1.0, and the strength of noise is the rest -> 1.0 - strPulse
*/
double str = 0.0;
if (mixedExcitation) {
for (int j = 0; j < orderM; j++) {
hp[j] = hn[j] = 0.0;
for (int i = 0; i < numM; i++) {
str = strPst.getPar(mcepframe, i);
hp[j] += str * h[i][j];
hn[j] += (1 - str) * h[i][j];
// hp[j] += strPst.getPar(mcepframe, i) * h[i][j];
// hn[j] += ( 0.9 - strPst.getPar(mcepframe, i) ) * h[i][j];
}
}
}
/* f0 -> pitch , in original code here it is used p, so f0=p in the c code */
if (f0 != 0.0)
f0 = rate / f0;
/* p1 is initialised in -1, so this will be done just for the first frame */
if (p1 < 0) {
p1 = f0;
pc = p1;
/* for LSP */
if (stage != 0) {
C[0] = (use_log_gain) ? LZERO : ZERO;
double PI_m = Math.PI / m;
for (int i = 0; i < m; i++)
C[i] = i * PI_m;
/* LSP -> MGC */
lsp2mgc(C, C, (m - 1), alpha);
mc2b(C, C, (m - 1), alpha);
gnorm(C, C, (m - 1), gamma);
for (int i = 1; i < m; i++)
C[i] *= gamma;
}
}
if (stage == 0) {
/* postfiltering, this is done if beta>0.0 */
postfilter_mgc(mc, (m - 1), alpha, beta);
/* mc2b: transform mel-cepstrum to MLSA digital filter coefficients */
mc2b(mc, CC, (m - 1), alpha);
for (int i = 0; i < m; i++)
CINC[i] = (CC[i] - C[i]) * IPERIOD / fprd;
} else {
lsp2mgc(mc, CC, (m - 1), alpha);
mc2b(CC, CC, (m - 1), alpha);
gnorm(CC, CC, (m - 1), gamma);
for (int i = 1; i < m; i++)
CC[i] *= gamma;
for (int i = 0; i < m; i++)
CINC[i] = (CC[i] - C[i]) * IPERIOD / fprd;
}
/* p=f0 in c code!!! */
if (p1 != 0.0 && f0 != 0.0) {
inc = (f0 - p1) * (double) IPERIOD / (double) fprd;
} else {
inc = 0.0;
pc = f0;
p1 = 0.0;
// System.out.println(" inc=" + inc + " ***pc=" + pc + " p1=" + p1);
}
/* Here i need to generate both xp:pulse and xn:noise signals separately */
// gauss = false; /* Mixed excitation works better with nomal noise */
/* Generate fperiod samples per feature vector, normally 80 samples per frame */
// p1=0.0;
for (int j = fprd - 1, i = (IPERIOD + 1) / 2; j >= 0; j--) {
if (p1 == 0.0) {
x = uniformRand(); /* returns 1.0 or -1.0 uniformly distributed */
if (mixedExcitation) {
xn = x;
xp = 0.0;
}
} else {
if ((pc += 1.0) >= p1) {
if (fourierMagnitudes) {
magPulse = genPulseFromFourierMag(magPst, mcepframe, p1);
magSample = 0;
magPulseSize = magPulse.length;
x = magPulse[magSample];
magSample++;
} else
x = Math.sqrt(p1);
pc = pc - p1;
} else {
if (fourierMagnitudes) {
if (magSample >= magPulseSize) {
x = 0.0;
} else
x = magPulse[magSample];
magSample++;
} else
x = 0.0;
}
if (mixedExcitation) {
xp = x;
xn = uniformRand();
}
}
// System.out.print(" x=" + x);
/* apply the shaping filters to the pulse and noise samples */
/* i need memory of at least for M samples in both signals */
if (mixedExcitation) {
fxp = 0.0;
fxn = 0.0;
for (k = orderM - 1; k > 0; k--) {
fxp += hp[k] * xpulseSignal[k];
fxn += hn[k] * xnoiseSignal[k];
xpulseSignal[k] = xpulseSignal[k - 1];
xnoiseSignal[k] = xnoiseSignal[k - 1];
}
fxp += hp[0] * xp;
fxn += hn[0] * xn;
xpulseSignal[0] = xp;
xnoiseSignal[0] = xn;
/* x is a pulse noise excitation and mix is mixed excitation */
mix = fxp + fxn;
pulse[j] = fxp;
noise[j] = fxn;
source[j] = mix;
// System.out.format("%d = %f \n", j, mix);
/* comment this line if no mixed excitation, just pulse and noise */
x = mix; /* excitation sample */
}
if (stage == 0) {
if (x != 0.0)
x *= Math.exp(C[0]);
x = mlsadf(x, C, m, alpha, D1, pt2, pt3);
} else {
x *= C[0];
x = mglsadf(x, C, (m - 1), alpha, stage, D1);
}
// System.out.format("%f ", x);
audio_double[s_double] = x;
if (audioProducer != null) {
audioProducer.putOneDataPoint(x);
}
s_double++;
if ((--i) == 0) {
p1 += inc;
for (k = 0; k < m; k++) {
C[k] += CINC[k];
}
i = IPERIOD;
}
} /* for each sample in a period fprd */
/*********
* For debuging if(voiced[mcepframe]) { double magf[] = SignalProcUtils.getFrameHalfMagnitudeSpectrum(source, 512, 1);
* MaryUtils.plot(magf, "magf"); } System.out.format("str=%.2f\n", str);
*/
p1 = f0;
/* move elements in c */
System.arraycopy(CC, 0, C, 0, m);
} /* for each mcep frame */
logger.debug("Finish processing " + mcepframe + " mcep frames.");
return (audio_double);
} /* method htsMLSAVocoder() */
/**
 * Number of audio samples the vocoder will emit for a spectral stream: one frame period of samples per frame.
 *
 * @param mcepPst
 *            spectral parameter stream; only its frame count is read
 * @param htsData
 *            configuration; only its frame period is read
 * @return {@code mcepPst.getT() * htsData.getFperiod()}
 */
private int computeAudioSize(HTSPStream mcepPst, HMMData htsData) {
    final int frameCount = mcepPst.getT();
    final int samplesPerFrame = htsData.getFperiod();
    return frameCount * samplesPerFrame;
}
/**
 * Debug helper: prints a heading followed by the first {@code m} entries of a vector, one per line, to standard output.
 *
 * @param val
 *            heading line printed before the values
 * @param m
 *            number of leading entries to print
 * @param vec
 *            vector to dump
 */
private void printVector(String val, int m, double vec[]) {
    System.out.println(val);
    int index = 0;
    while (index < m) {
        System.out.println("v[" + index + "]=" + vec[index]);
        index++;
    }
}
/**
 * mlsafir: FIR sub-stage of the MLSA filter. Pushes one sample into the delay line stored in {@code d} starting at
 * offset {@code _pt3}, warps the line with {@code a}, and returns the sum of the warped taps 2..m weighted by
 * {@code b}. The delay line is shifted by one position before returning.
 *
 * @param x
 *            input sample
 * @param b
 *            filter coefficients; entries 2..m are read
 * @param m
 *            highest tap index
 * @param a
 *            warping factor
 * @param d
 *            shared delay buffer; entries {@code _pt3 .. _pt3 + m + 1} are read and written
 * @param _pt3
 *            offset of this stage's delay line inside {@code d}
 * @return filtered sample
 */
private static double mlsafir(double x, double b[], int m, double a, double d[], int _pt3) {
    final int base = _pt3;
    d[base] = x;
    d[base + 1] = (1 - a * a) * d[base] + (a * d[base + 1]);

    /* warp tap i and accumulate it straight away; later taps never touch tap i again */
    double y = 0.0;
    for (int tap = 2; tap <= m; tap++) {
        final int pos = base + tap;
        d[pos] += a * (d[pos + 1] - d[pos - 1]);
        y += d[pos] * b[tap];
    }

    /* shift the delay line by one sample */
    int pos = base + m + 1;
    while (pos > base + 1) {
        d[pos] = d[pos - 1];
        pos--;
    }
    return y;
}
/**
 * mlsadf1: first stage of the MLSA filter. Runs {@link #PADEORDER} cascaded first-order sections that use only
 * {@code b[1]}, combining them with the Pade coefficients.
 *
 * @param x
 *            input sample
 * @param b
 *            filter coefficients; only {@code b[1]} is read
 * @param m
 *            filter order (not used by this stage)
 * @param a
 *            warping factor
 * @param d
 *            delay buffer; entries {@code 1 .. 2 * PADEORDER + 1} are read and written
 * @return output sample of the first stage
 */
private static double mlsadf1(double x, double b[], int m, double a, double d[]) {
    /* pt plays the role of the C pointer pt = &d1[pd + 1] */
    final int pt = PADEORDER + 1;
    double out = 0.0;
    int i = PADEORDER;
    while (i > 0) {
        d[i] = (1 - a * a) * d[pt + i - 1] + a * d[i];
        d[pt + i] = d[i] * b[1];
        final double v = d[pt + i] * pade[ppade + i];
        /* odd sections are added to the feedback signal, even sections subtracted */
        if ((i & 1) == 1) {
            x += v;
        } else {
            x -= v;
        }
        out += v;
        i--;
    }
    d[pt] = x;
    return out + x;
}
/**
 * mlsadf2: second stage of the MLSA filter. Runs {@link #PADEORDER} cascaded {@code mlsafir} sections, each with its
 * own delay line inside {@code d}, combining them with the Pade coefficients.
 *
 * @param x
 *            input sample
 * @param b
 *            filter coefficients
 * @param m
 *            highest tap index handed to {@code mlsafir}
 * @param a
 *            warping factor
 * @param d
 *            shared delay buffer
 * @param pt2
 *            offset in {@code d} of the section input/output values
 * @param pt3
 *            per-section offsets of the {@code mlsafir} delay lines; entries 1..PADEORDER are read
 * @return output sample of the second stage
 */
private static double mlsadf2(double x, double b[], int m, double a, double d[], int pt2, int pt3[]) {
    double out = 0.0;
    int i = PADEORDER;
    while (i > 0) {
        final int slot = pt2 + i;
        /* section i is fed with the output of section i-1 from the previous call */
        d[slot] = mlsafir(d[slot - 1], b, m, a, d, pt3[i]);
        final double v = d[slot] * pade[ppade + i];
        /* odd sections are added to the feedback signal, even sections subtracted */
        if ((i & 1) == 1) {
            x += v;
        } else {
            x -= v;
        }
        out += v;
        i--;
    }
    d[pt2] = x;
    return out + x;
}
/**
 * mlsadf: HTS Mel Log Spectrum Approximation filter. Feeds one excitation sample through the two filter stages
 * ({@code mlsadf1} followed by {@code mlsadf2}).
 *
 * @param x
 *            input (excitation) sample
 * @param b
 *            MLSA filter coefficients
 * @param m
 *            number of coefficients; the second stage is run with {@code m - 1}
 * @param a
 *            warping factor
 * @param d
 *            delay buffer shared by both stages; updated in place
 * @param pt2
 *            offset in {@code d} used by the second stage
 * @param pt3
 *            per-section offsets in {@code d} used by the second stage
 * @return filtered sample
 */
public static double mlsadf(double x, double b[], int m, double a, double d[], int pt2, int pt3[]) {
    final double afterFirstStage = mlsadf1(x, b, m, a, d);
    return mlsadf2(afterFirstStage, b, m - 1, a, d, pt2, pt3);
}
/**
 * uniform_rand: draws one noise sample that is either 1.0 or -1.0, chosen by the next boolean of this vocoder's
 * random generator.
 *
 * @return 1.0 or -1.0
 */
public double uniformRand() {
    if (rand.nextBoolean()) {
        return 1.0;
    }
    return -1.0;
}
/**
 * mc2b: transform mel-cepstrum to MLSA digital filter coefficients, using {@code b[m] = mc[m]} and
 * {@code b[i] = mc[i] - a * b[i + 1]} for i = m-1 .. 0. Works in place when {@code mc} and {@code b} are the same
 * array.
 *
 * @param mc
 *            mel-cepstral coefficients, indices 0..m
 * @param b
 *            output filter coefficients, indices 0..m
 * @param m
 *            order (highest index)
 * @param a
 *            warping factor
 */
public static void mc2b(double mc[], double b[], int m, double a) {
    b[m] = mc[m];
    for (int i = m - 1; i >= 0; i--) {
        b[i] = mc[i] - a * b[i + 1];
    }
}
/**
 * b2mc: transform MLSA digital filter coefficients to mel-cepstrum. This is the inverse of {@code mc2b}:
 * {@code mc[m] = b[m]} and {@code mc[i] = b[i] + a * b[i + 1]} for i = m-1 .. 0. Works in place when {@code b} and
 * {@code mc} are the same array.
 *
 * @param b
 *            MLSA filter coefficients, indices 0..m
 * @param mc
 *            output mel-cepstral coefficients, indices 0..m
 * @param m
 *            order (highest index)
 * @param a
 *            warping factor
 */
public static void b2mc(double b[], double mc[], int m, double a) {
    /* d always holds the ORIGINAL b[i + 1], so the in-place case (b == mc) stays correct */
    double d = mc[m] = b[m];
    /* start below the top coefficient: mc[m] is already final and must not get a second a * b[m] added */
    for (int i = m - 1; i >= 0; i--) {
        double o = b[i] + (a * d);
        d = b[i];
        mc[i] = o;
    }
}
/**
 * freqt: frequency transformation of a cepstral sequence. Reads {@code c1[0..m1]}, warps it with the all-pass
 * factor {@code a} and writes the transformed sequence to {@code c2[0..m2]} (all {@code m2 + 1} coefficients).
 * With {@code a == 0} the result is {@code c1} truncated or zero-padded to order {@code m2}.
 *
 * @param c1
 *            input coefficients, indices 0..m1
 * @param m1
 *            input order
 * @param c2
 *            output coefficients, indices 0..m2; may be the same array as {@code c1}, since the input is consumed
 *            before the output is written
 * @param m2
 *            output order
 * @param a
 *            warping factor
 */
public static void freqt(double c1[], int m1, double c2[], int m2, double a) {
    final double b = 1 - a * a;
    /* one work buffer holding two halves: [0..m2] = previous pass, [g..g+m2] = current pass */
    final double freqt_buff[] = new double[(m2 + m2 + 2)];
    final int g = m2 + 1; /* offset of the current-pass half */

    /* feed the input from the highest order down to c1[0] */
    for (int i = -m1; i <= 0; i++) {
        if (0 <= m2) {
            freqt_buff[0] = freqt_buff[g];
            freqt_buff[g] = c1[-i] + a * freqt_buff[0];
        }
        if (1 <= m2) {
            freqt_buff[1] = freqt_buff[g + 1];
            freqt_buff[g + 1] = b * freqt_buff[0] + a * freqt_buff[1];
        }
        for (int j = 2; j <= m2; j++) {
            freqt_buff[j] = freqt_buff[g + j];
            freqt_buff[g + j] = freqt_buff[j - 1] + a * (freqt_buff[j] - freqt_buff[g + j - 1]);
        }
    }

    /* copy ALL computed orders 0..m2; copying only m2 values would leave c2[m2] unset */
    System.arraycopy(freqt_buff, g, c2, 0, m2 + 1);
}
/**
 * c2ir: evaluates the minimum phase impulse response from the minimum phase cepstrum, using the recursion
 * {@code hh[0] = exp(c[0])} and {@code hh[n] = (1/n) * sum_{k=1..min(n, nc-1)} k * c[k] * hh[n-k]}.
 *
 * @param c
 *            cepstrum, indices 0..nc-1
 * @param nc
 *            number of cepstral coefficients
 * @param hh
 *            output impulse response, indices 0..leng-1
 * @param leng
 *            number of impulse response samples to compute
 */
public static void c2ir(double c[], int nc, double hh[], int leng) {
    hh[0] = Math.exp(c[0]);
    for (int n = 1; n < leng; n++) {
        /* only cepstral terms that exist (k < nc) and reach back no further than hh[0] contribute */
        int lastTerm = n;
        if (n >= nc) {
            lastTerm = nc - 1;
        }
        double acc = 0;
        for (int k = 1; k <= lastTerm; k++) {
            acc += k * c[k] * hh[n - k];
        }
        hh[n] = acc / n;
    }
}
/**
 * b2en: energy of the impulse response described by a set of MLSA filter coefficients; used by the postfilter.
 * The coefficients are converted to mel-cepstrum ({@code b2mc}), unwarped with {@code -a} ({@code freqt}) to a
 * cepstrum of order {@code IRLENG - 1}, turned into an impulse response of {@code IRLENG} samples ({@code c2ir}),
 * and the squared samples are summed.
 *
 * @param b
 *            MLSA filter coefficients, indices 0..m
 * @param m
 *            order of {@code b}
 * @param a
 *            warping factor
 * @return sum of squares of the first {@code IRLENG} impulse response samples
 */
public static double b2en(double b[], int m, double a) {
    /* all three work buffers share one (generous) length */
    final int workLength = (m + 1) + 2 * IRLENG;
    final double[] melCepstrum = new double[workLength];
    final double[] cepstrum = new double[workLength];
    final double[] impulseResponse = new double[workLength];

    b2mc(b, melCepstrum, m, a);
    freqt(melCepstrum, m, cepstrum, IRLENG - 1, -a);
    c2ir(cepstrum, IRLENG, impulseResponse, IRLENG);

    double en = 0.0;
    int i = 0;
    while (i < IRLENG) {
        en += impulseResponse[i] * impulseResponse[i];
        i++;
    }
    return en;
}
/**
 * ignorm: inverse gain normalization. Converts normalized generalized cepstral coefficients {@code c1[0..m]} into
 * {@code c2[0..m]}. Works in place when {@code c1} and {@code c2} are the same array.
 * <ul>
 * <li>{@code ng != 0}: with {@code k = c1[0]^ng}, {@code c2[i] = k * c1[i]} for i = 1..m and
 * {@code c2[0] = (k - 1) / ng};</li>
 * <li>{@code ng == 0}: {@code c2[i] = c1[i]} for i = 1..m and {@code c2[0] = log(c1[0])}.</li>
 * </ul>
 *
 * @param c1
 *            normalized input coefficients, indices 0..m
 * @param c2
 *            output coefficients, indices 0..m
 * @param m
 *            order (highest index)
 * @param ng
 *            gamma
 */
public static void ignorm(double c1[], double c2[], int m, double ng) {
    if (ng != 0.0) {
        double k = Math.pow(c1[0], ng);
        for (int i = m; i >= 1; i--)
            c2[i] = k * c1[i];
        c2[0] = (k - 1.0) / ng;
    } else {
        /* copy all m coefficients 1..m, the same range the ng != 0 branch covers; m == 0 copies nothing */
        if (m > 0)
            System.arraycopy(c1, 1, c2, 1, m);
        c2[0] = Math.log(c1[0]);
    }
}
/**
 * gnorm: gain normalization. Converts generalized cepstral coefficients {@code c1[0..m]} into their
 * gain-normalized form {@code c2[0..m]}. Works in place when {@code c1} and {@code c2} are the same array.
 * <ul>
 * <li>{@code g != 0}: with {@code k = 1 + g * c1[0]}, {@code c2[i] = c1[i] / k} for i = 1..m and
 * {@code c2[0] = k^(1/g)};</li>
 * <li>{@code g == 0}: {@code c2[i] = c1[i]} for i = 1..m and {@code c2[0] = exp(c1[0])}.</li>
 * </ul>
 *
 * @param c1
 *            input coefficients, indices 0..m
 * @param c2
 *            normalized output coefficients, indices 0..m
 * @param m
 *            order (highest index)
 * @param g
 *            gamma
 */
public static void gnorm(double c1[], double c2[], int m, double g) {
    if (g != 0.0) {
        double k = 1.0 + g * c1[0];
        for (; m >= 1; m--)
            c2[m] = c1[m] / k;
        c2[0] = Math.pow(k, 1.0 / g);
    } else {
        /* copy all m coefficients 1..m, the same range the g != 0 branch covers; m == 0 copies nothing */
        if (m > 0)
            System.arraycopy(c1, 1, c2, 1, m);
        c2[0] = Math.exp(c1[0]);
    }
}
/**
 * lsp2lpc: transform LSP to LPC. lsp[1..m] → a=lpc[0..m] a[0]=1.0
 * <p>
 * The LSP values are turned into second-order filter sections (one chain built from the LSPs at even buffer
 * positions, one from those at odd positions). A unit impulse is fed through both chains for m+1 steps and the
 * combined chain outputs give the LPC coefficients. {@code lsp[0]} is not read here.
 *
 * @param lsp
 *            LSP values; indices 1..m are read (they are passed to Math.cos, i.e. treated as angles in radians)
 * @param a
 *            output LPC coefficients; indices 0..m are written, a[0] is set to 1.0
 * @param m
 *            order
 */
public static void lsp2lpc(double lsp[], double a[], int m) {
int i, k, mh1, mh2, flag_odd;
double xx, xf, xff;
int p, q; /* offsets of lsp2lpc_buff */
int a0, a1, a2, b0, b1, b2; /* offsets of lsp2lpc_buff */
/* split the m LSPs into two groups of mh1 and mh2 values; for odd m the first group gets the extra one */
flag_odd = 0;
if (m % 2 == 0)
mh1 = mh2 = m / 2;
else {
mh1 = (m + 1) / 2;
mh2 = (m - 1) / 2;
flag_odd = 1;
}
/* single work buffer; the regions below are addressed through the offsets p, q, a0..a2, b0..b2 */
double[] lsp2lpc_buff = new double[(5 * m + 6)];
/* NOTE(review): lsp2lpc_size is assigned but never read in this method */
int lsp2lpc_size = m;
/* offsets of lsp2lpcbuff */
/* [0..m) copy of the LSPs, p: mh1 section coefficients, q: mh2 section coefficients, */
/* a0/a1/a2: state of the first chain (mh1+1 each), b0/b1/b2: state of the second chain (mh2+1 each) */
p = m;
q = p + mh1;
a0 = q + mh2;
a1 = a0 + (mh1 + 1);
a2 = a1 + (mh1 + 1);
b0 = a2 + (mh1 + 1);
b1 = b0 + (mh2 + 1);
b2 = b1 + (mh2 + 1);
/* move lsp -> lsp2lpc_buff */
System.arraycopy(lsp, 1, lsp2lpc_buff, 0, m);
/* clear the filter states (a freshly allocated Java array is already zero, so these loops are defensive) */
for (i = 0; i < mh1 + 1; i++)
lsp2lpc_buff[a0 + i] = 0.0;
for (i = 0; i < mh1 + 1; i++)
lsp2lpc_buff[a1 + i] = 0.0;
for (i = 0; i < mh1 + 1; i++)
lsp2lpc_buff[a2 + i] = 0.0;
for (i = 0; i < mh2 + 1; i++)
lsp2lpc_buff[b0 + i] = 0.0;
for (i = 0; i < mh2 + 1; i++)
lsp2lpc_buff[b1 + i] = 0.0;
for (i = 0; i < mh2 + 1; i++)
lsp2lpc_buff[b2 + i] = 0.0;
/* lsp filter parameters */
/* p[i] = -2cos of the LSPs at buffer positions 0, 2, 4, ...; q[i] = -2cos of those at positions 1, 3, 5, ... */
for (i = k = 0; i < mh1; i++, k += 2)
lsp2lpc_buff[p + i] = -2.0 * Math.cos(lsp2lpc_buff[k]);
for (i = k = 0; i < mh2; i++, k += 2)
lsp2lpc_buff[q + i] = -2.0 * Math.cos(lsp2lpc_buff[k + 1]);
/* impulse response of analysis filter */
/* xx is the input sample: 1.0 on the first step, 0.0 afterwards (reset at the end of the loop body); */
/* xf and xff hold the inputs of the previous one and two steps */
xx = 1.0;
xf = xff = 0.0;
for (k = 0; k <= m; k++) {
/* inputs of the two chains; the combination differs for odd and even m */
if (flag_odd == 1) {
lsp2lpc_buff[a0 + 0] = xx;
lsp2lpc_buff[b0 + 0] = xx - xff;
xff = xf;
xf = xx;
} else {
lsp2lpc_buff[a0 + 0] = xx + xf;
lsp2lpc_buff[b0 + 0] = xx - xf;
xf = xx;
}
/* first chain: section i maps a0[i] to a0[i+1]; a1/a2 keep the section's two previous inputs */
for (i = 0; i < mh1; i++) {
lsp2lpc_buff[a0 + i + 1] = lsp2lpc_buff[a0 + i] + lsp2lpc_buff[p + i] * lsp2lpc_buff[a1 + i]
+ lsp2lpc_buff[a2 + i];
lsp2lpc_buff[a2 + i] = lsp2lpc_buff[a1 + i];
lsp2lpc_buff[a1 + i] = lsp2lpc_buff[a0 + i];
}
/* second chain, same structure with the q coefficients */
for (i = 0; i < mh2; i++) {
lsp2lpc_buff[b0 + i + 1] = lsp2lpc_buff[b0 + i] + lsp2lpc_buff[q + i] * lsp2lpc_buff[b1 + i]
+ lsp2lpc_buff[b2 + i];
lsp2lpc_buff[b2 + i] = lsp2lpc_buff[b1 + i];
lsp2lpc_buff[b1 + i] = lsp2lpc_buff[b0 + i];
}
/* the output of step 0 is skipped; step k (k >= 1) yields a[k-1] */
if (k != 0)
a[k - 1] = -0.5 * (lsp2lpc_buff[a0 + mh1] + lsp2lpc_buff[b0 + mh2]);
xx = 0.0;
}
/* shift the results up by one index with flipped sign, then set the leading coefficient a[0] = 1.0 */
for (i = m - 1; i >= 0; i--)
a[i + 1] = -a[i];
a[0] = 1.0;
}
/**
 * gc2gc: generalized cepstral transformation. Converts gain-normalized generalized cepstral coefficients of order
 * {@code m1} with gamma {@code g1} into coefficients of order {@code m2} with gamma {@code g2}. The input is
 * copied first, so {@code c1} and {@code c2} may be the same array.
 *
 * @param c1
 *            input coefficients, indices 0..m1
 * @param m1
 *            input order
 * @param g1
 *            input gamma
 * @param c2
 *            output coefficients, indices 0..m2
 * @param m2
 *            output order
 * @param g2
 *            output gamma
 */
public static void gc2gc(double c1[], int m1, double g1, double c2[], int m2, double g2) {
    /* private snapshot of the input, so writing c2 cannot disturb it */
    final double[] source = Arrays.copyOf(c1, m1 + 1);
    c2[0] = source[0];
    for (int i = 1; i <= m2; i++) {
        final int lastTerm = (i > m1) ? m1 : i - 1;
        double sumWeightedByK = 0.0;
        double sumWeightedByRest = 0.0;
        for (int k = 1; k <= lastTerm; k++) {
            final int rest = i - k;
            final double product = source[k] * c2[rest];
            sumWeightedByK += k * product;
            sumWeightedByRest += rest * product;
        }
        final double recursionTerm = (g2 * sumWeightedByK - g1 * sumWeightedByRest) / i;
        /* beyond the input order there is no direct contribution from the source */
        if (i > m1) {
            c2[i] = recursionTerm;
        } else {
            c2[i] = source[i] + recursionTerm;
        }
    }
}
/**
 * mgc2mgc: frequency and generalized cepstral transformation. Converts mel-generalized cepstral coefficients from
 * (order {@code m1}, alpha {@code a1}, gamma {@code g1}) to (order {@code m2}, alpha {@code a2}, gamma {@code g2}).
 * When the two alphas are equal no frequency warping is needed and {@code c1} is gain-normalized in place;
 * otherwise the coefficients are first warped into {@code c2}.
 *
 * @param c1
 *            input coefficients, indices 0..m1; modified in place when {@code a1 == a2}
 * @param m1
 *            input order
 * @param a1
 *            input alpha
 * @param g1
 *            input gamma
 * @param c2
 *            output coefficients, indices 0..m2
 * @param m2
 *            output order
 * @param a2
 *            output alpha
 * @param g2
 *            output gamma
 */
public static void mgc2mgc(double c1[], int m1, double a1, double g1, double c2[], int m2, double a2, double g2) {
    if (a1 != a2) {
        /* different warping: frequency-transform first, then convert gamma at the output order */
        final double relativeAlpha = (a2 - a1) / (1 - a1 * a2);
        freqt(c1, m1, c2, m2, relativeAlpha);
        gnorm(c2, c2, m2, g1);
        gc2gc(c2, m2, g1, c2, m2, g2);
    } else {
        /* same warping: only the gamma (and order) conversion is needed */
        gnorm(c1, c1, m1, g1);
        gc2gc(c1, m1, g1, c2, m2, g2);
    }
    ignorm(c2, c2, m2, g2);
}
/**
 * lsp2mgc: transform LSP to MGC. lsp=C[0..m] mgc=C[0..m]. The LSPs in {@code lsp[1..m]} are converted to LPC, the
 * gain is taken from {@code lsp[0]} (exponentiated when the log-gain flag of this vocoder is set), and the result
 * is converted with this vocoder's {@code gamma} and {@code stage}.
 *
 * @param lsp
 *            gain in index 0, LSP values in indices 1..m
 * @param mgc
 *            output coefficients, indices 0..m
 * @param m
 *            order
 * @param alpha
 *            warping factor
 */
public void lsp2mgc(double lsp[], double mgc[], int m, double alpha) {
    /* lsp2lpc: lsp starts in 1! lsp[1..m] --> mgc[0..m] */
    lsp2lpc(lsp, mgc, m);

    /* gain term */
    if (use_log_gain) {
        mgc[0] = Math.exp(lsp[0]);
    } else {
        mgc[0] = lsp[0];
    }

    /* mgc2mgc */
    ignorm(mgc, mgc, m, gamma);
    for (int i = 1; i <= m; i++) {
        mgc[i] *= -stage;
    }
    /* input and output is in mgc=C */
    mgc2mgc(mgc, m, alpha, gamma, mgc, m, alpha, gamma);
}
/**
 * mglsadf: MGLSA filter. Passes one sample through {@code n} cascaded {@code mglsadff} sections; section i uses
 * the slice of {@code d} that starts at {@code i * (m + 1)}.
 *
 * @param x
 *            input (excitation) sample
 * @param b
 *            filter coefficients
 * @param m
 *            order of each section
 * @param a
 *            warping factor
 * @param n
 *            number of cascaded sections
 * @param d
 *            delay buffer shared by all sections; updated in place
 * @return filtered sample
 */
public static double mglsadf(double x, double b[], int m, double a, int n, double d[]) {
    final int sectionLength = m + 1;
    double sample = x;
    int section = 0;
    while (section < n) {
        sample = mglsadff(sample, b, m, a, d, section * sectionLength);
        section++;
    }
    return sample;
}
/**
 * mglsadff: one section of the MGLSA filter. Subtracts the feedback formed from the warped delay line and the
 * coefficients {@code b[1..m]} from the input, then shifts the delay line and stores the new state.
 *
 * @param x
 *            input sample
 * @param b
 *            filter coefficients; indices 1..m are read
 * @param m
 *            order of the section
 * @param a
 *            warping factor
 * @param d
 *            shared delay buffer; entries {@code d_offset .. d_offset + m} are read and written
 * @param d_offset
 *            start of this section's delay line inside {@code d}
 * @return output sample of the section
 */
private static double mglsadff(double x, double b[], int m, double a, double d[], int d_offset) {
    final int base = d_offset;

    /* feedback: warp each tap, then weight it */
    double feedback = d[base] * b[1];
    for (int i = 1; i < m; i++) {
        final int pos = base + i;
        d[pos] += a * (d[pos + 1] - d[pos - 1]);
        feedback += d[pos] * b[i + 1];
    }
    x -= feedback;

    /* shift the delay line by one sample and insert the new state at its head */
    int pos = base + m;
    while (pos > base) {
        d[pos] = d[pos - 1];
        pos--;
    }
    d[base] = a * d[base] + (1 - a * a) * x;
    return x;
}
/**
 * postfilter_mgc: postfilter for mel-cepstrum, applied in place. Does nothing unless {@code beta > 0} and
 * {@code m > 1}. The coefficients are converted to MLSA filter coefficients, emphasised by {@code (1 + beta)},
 * gain-compensated so that the impulse response energy (see {@code b2en}) is kept, and converted back.
 *
 * @param mgc
 *            mel-cepstral coefficients, indices 0..m; overwritten with the postfiltered values
 * @param m
 *            order
 * @param alpha
 *            warping factor
 * @param beta
 *            postfiltering strength
 */
public static void postfilter_mgc(double mgc[], int m, double alpha, double beta) {
    if (!(beta > 0.0 && m > 1)) {
        return; /* postfiltering disabled or order too small */
    }
    final double[] coeffs = new double[m + 1];
    mc2b(mgc, coeffs, m, alpha);
    final double energyBefore = b2en(coeffs, m, alpha);

    coeffs[1] -= beta * alpha * mgc[2];
    int k = 2;
    while (k < m) {
        coeffs[k] *= (1.0 + beta);
        k++;
    }

    /* compensate the gain so the energy of the impulse response is unchanged */
    final double energyAfter = b2en(coeffs, m, alpha);
    coeffs[0] += Math.log(energyBefore / energyAfter) / 2;
    b2mc(coeffs, mgc, m, alpha);
}
/**
 * Generate one pitch period from the Fourier magnitudes stored for frame {@code n} of a parameter stream.
 *
 * @param mag
 *            stream of Fourier magnitudes
 * @param n
 *            index of the frame whose magnitude vector is used
 * @param f0
 *            value forwarded unchanged to {@link #genPulseFromFourierMag(double[], double)}
 * @return pulse
 */
public static double[] genPulseFromFourierMag(HTSPStream mag, int n, double f0) {
    final double[] frameMagnitudes = mag.getParVec(n);
    return genPulseFromFourierMag(frameMagnitudes, f0);
}
/**
 * Generate one pitch period from Fourier magnitudes. A symmetric magnitude spectrum of length 2*T is built from the
 * given harmonics (remaining components set to 1.0, DC set to 0), transformed with an inverse FFT, circularly
 * shifted by the number of harmonics and scaled by {@code sqrt(round(f0))}.
 *
 * @param mag
 *            Fourier magnitudes, one value per harmonic
 * @param f0
 *            value used to choose the transform size (T = 512 when below 512 after rounding, otherwise 1024) and to
 *            scale the pulse. NOTE(review): the visible caller passes its pitch-period variable here rather than a
 *            frequency -- confirm the intended unit.
 * @return pulse of T samples
 */
public static double[] genPulseFromFourierMag(double[] mag, double f0) {
    int numHarm = mag.length;
    int currentF0 = (int) Math.round(f0);
    int T;
    if (currentF0 < 512)
        T = 512;
    else
        T = 1024;
    int T2 = 2 * T;

    /* since is FFT2 no aperiodicFlag or jitter of 25% is applied */

    /* get the pulse */
    double[] pulse = new double[T];
    double[] real = new double[T2];
    double[] imag = new double[T2];

    /* copy Fourier magnitudes (Wai C. Chu "Speech Coding algorithms foundation and evolution of standardized coders" pg. 460) */
    real[0] = real[T] = 0.0; /* DC component set to zero */
    for (int i = 1; i <= numHarm; i++) {
        real[i] = real[T - i] = real[T + i] = real[T2 - i] = mag[i - 1]; /* Symmetric extension */
        imag[i] = imag[T - i] = imag[T + i] = imag[T2 - i] = 0.0;
    }
    for (int i = (numHarm + 1); i < (T - numHarm); i++) { /* Default components set to 1.0 */
        real[i] = real[T - i] = real[T + i] = real[T2 - i] = 1.0;
        imag[i] = imag[T - i] = imag[T + i] = imag[T2 - i] = 0.0;
    }

    /* Calculate inverse Fourier transform */
    FFT.transform(real, imag, true);

    /* circular shift and normalise multiplying by sqrt(F0) */
    double sqrt_f0 = Math.sqrt(currentF0);
    for (int i = 0; i < T; i++) {
        /* Java's % keeps the sign of the dividend, so wrap negative indices (i < numHarm) back into [0, T) */
        int shifted = (i - numHarm) % T;
        if (shifted < 0)
            shifted += T;
        pulse[i] = real[shifted] * sqrt_f0;
    }
    return pulse;
}
/**
 * Circularly shifts the first {@code T} entries of {@code y} in place by {@code n} positions, so that afterwards
 * {@code y[i]} holds the old {@code y[(i - n) mod T]}.
 *
 * @param y
 *            array to rotate; must hold at least {@code T} entries
 * @param T
 *            number of leading entries taking part in the rotation
 * @param n
 *            shift amount
 */
private void circularShift(double y[], int T, int n) {
    final double[] rotated = new double[T];
    for (int target = 0; target < T; target++) {
        rotated[target] = y[modShift(target - n, T)];
    }
    System.arraycopy(rotated, 0, y, 0, T);
}
/**
 * Wraps an index into the range {@code [0, N)} by repeatedly adding or subtracting {@code N}.
 *
 * @param n
 *            index to wrap
 * @param N
 *            length of the range; expected to be positive
 * @return {@code n} brought into {@code [0, N)}
 */
private int modShift(int n, int N) {
    int wrapped = n;
    if (wrapped >= 0) {
        /* too large: step down until inside the range */
        while (wrapped >= N) {
            wrapped -= N;
        }
        return wrapped;
    }
    /* negative: step up until non-negative */
    do {
        wrapped += N;
    } while (wrapped < 0);
    return wrapped;
}
/**
 * Stand alone testing reading parameters from files in SPTK format.
 * <p>
 * Loads a voice configuration, reads the lf0, mgc, str and mag parameter files of one example utterance into
 * HTSPStream objects, runs the vocoder with mixed excitation and Fourier magnitudes enabled, scales the result to
 * a peak of 0.3, writes it to a WAVE file and plays that file. All paths and the voice are hard-coded below;
 * {@code args} is not read.
 *
 * @param args
 *            ignored
 * @throws IOException
 *             if one of the parameter files cannot be read or the output file cannot be written
 * @throws InterruptedException
 *             if waiting for the audio player is interrupted
 * @throws Exception
 *             propagated from voice initialisation or from the vocoder
 */
public static void main1(String[] args) throws IOException, InterruptedException, Exception {
/* configure log info */
// org.apache.log4j.BasicConfigurator.configure();
HMMData htsData = new HMMData();
HTSPStream lf0Pst, mcepPst, strPst, magPst;
boolean[] voiced = null;
LEDataInputStream lf0Data, mcepData, strData, magData;
String lf0File, mcepFile, strFile, magFile, outFile, residualFile;
String voiceName, voiceConfig, outDir, voiceExample, hmmTrainDir;
/* hard-coded locations from the original author's environment */
String MaryBase = "/project/mary/marcela/openmary/";
outDir = "/project/mary/marcela/openmary/tmp/";
outFile = outDir + "tmp.wav";
// Voice
/*
* voiceName = "hsmm-slt"; voiceConfig = "en_US-hsmm-slt.config"; voiceExample = "cmu_us_arctic_slt_a0001"; hmmTrainDir =
* "/project/mary/marcela/HMM-voices/HTS-demo_CMU-ARCTIC-SLT/"; // The directory where the voice was trained
*/
voiceName = "hsmm-ot";
voiceConfig = "tr-hsmm-ot.config";
voiceExample = "ot0010";
hmmTrainDir = "/project/mary/marcela/HMM-voices/turkish/"; // The directory where the voice was trained
htsData.initHMMData(voiceName, MaryBase, voiceConfig);
htsData.setUseMixExc(true);
htsData.setUseFourierMag(true); /* use Fourier magnitudes for pulse generation */
/* parameters extracted from real data with SPTK and snack */
lf0File = hmmTrainDir + "data/lf0/" + voiceExample + ".lf0";
mcepFile = hmmTrainDir + "data/mgc/" + voiceExample + ".mgc";
strFile = hmmTrainDir + "data/str/" + voiceExample + ".str";
magFile = hmmTrainDir + "data/mag/" + voiceExample + ".mag";
/* vector sizes of the four streams, as declared by the voice */
int mcepVsize = htsData.getCartTreeSet().getMcepVsize();
int strVsize = htsData.getCartTreeSet().getStrVsize();
int lf0Vsize = htsData.getCartTreeSet().getLf0Stream();
int magVsize = htsData.getCartTreeSet().getMagVsize();
int totalFrame = 0;
int lf0VoicedFrame = 0;
float fval;
int i, j;
/* NOTE(review): none of the streams opened in this method is closed if a read throws (no try/finally) */
lf0Data = new LEDataInputStream(new BufferedInputStream(new FileInputStream(lf0File)));
/* First i need to know the size of the vectors */
/* first pass over the lf0 file: count frames by reading floats until the end of the file */
try {
while (true) {
fval = lf0Data.readFloat();
totalFrame++;
if (fval > 0)
lf0VoicedFrame++;
}
} catch (EOFException e) {
/* expected: reaching the end of the file is what terminates the counting loop */
}
lf0Data.close();
/* CHECK: I do not know why mcep has totalframe-2 frames less than lf0 and str ??? */
totalFrame = totalFrame - 2;
System.out.println("Total number of Frames = " + totalFrame);
voiced = new boolean[totalFrame];
/* Initialise HTSPStream-s */
lf0Pst = new HTSPStream(lf0Vsize, totalFrame, HMMData.FeatureType.LF0, 0);
mcepPst = new HTSPStream(mcepVsize, totalFrame, HMMData.FeatureType.MGC, 0);
strPst = new HTSPStream(strVsize, totalFrame, HMMData.FeatureType.STR, 0);
magPst = new HTSPStream(magVsize, totalFrame, HMMData.FeatureType.MAG, 0);
/* load lf0 data */
/* for lf0 i just need to load the voiced values */
/* second pass: negative values mark unvoiced frames; voiced values are stored densely, indexed by lf0VoicedFrame */
lf0VoicedFrame = 0;
lf0Data = new LEDataInputStream(new BufferedInputStream(new FileInputStream(lf0File)));
for (i = 0; i < totalFrame; i++) {
fval = lf0Data.readFloat();
// lf0Pst.setPar(i, 0, fval);
if (fval < 0)
voiced[i] = false;
else {
voiced[i] = true;
lf0Pst.setPar(lf0VoicedFrame, 0, fval);
lf0VoicedFrame++;
}
}
lf0Data.close();
/* load mgc data */
mcepData = new LEDataInputStream(new BufferedInputStream(new FileInputStream(mcepFile)));
for (i = 0; i < totalFrame; i++) {
for (j = 0; j < mcepPst.getOrder(); j++)
mcepPst.setPar(i, j, mcepData.readFloat());
}
mcepData.close();
/* load str data */
strData = new LEDataInputStream(new BufferedInputStream(new FileInputStream(strFile)));
for (i = 0; i < totalFrame; i++) {
for (j = 0; j < strPst.getOrder(); j++)
strPst.setPar(i, j, strData.readFloat());
}
strData.close();
/* load mag data */
magData = new LEDataInputStream(new BufferedInputStream(new FileInputStream(magFile)));
for (i = 0; i < totalFrame; i++) {
for (j = 0; j < magPst.getOrder(); j++)
magPst.setPar(i, j, magData.readFloat());
// System.out.println("i:" + i + " f0=" + Math.exp(lf0Pst.getPar(i, 0)) + " mag(1)=" + magPst.getPar(i, 0) +
// " str(1)=" + strPst.getPar(i, 0) );
}
magData.close();
AudioFormat af = getHTSAudioFormat(htsData);
double[] audio_double = null;
HTSVocoder par2speech = new HTSVocoder();
// par2speech.setUseLpcVocoder(true);
/* synthesise the whole utterance into memory; the last argument (audio producer) is null */
audio_double = par2speech.htsMLSAVocoder(lf0Pst, mcepPst, strPst, magPst, voiced, htsData, null);
// audio_double = par2speech.htsMLSAVocoder_residual(htsData, mcepPst, resFile);
long lengthInSamples = (audio_double.length * 2) / (af.getSampleSizeInBits() / 8);
par2speech.logger.debug("length in samples=" + lengthInSamples);
/* Normalise the signal before return, this will normalise between 1 and -1 */
/* NOTE(review): the factor below scales the peak to 0.3, and an all-zero signal would divide by zero */
double MaxSample = MathUtils.getAbsMax(audio_double);
for (i = 0; i < audio_double.length; i++)
audio_double[i] = 0.3 * (audio_double[i] / MaxSample);
DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio_double), af);
File fileOut = new File(outFile);
System.out.println("saving to file: " + outFile);
if (AudioSystem.isFileTypeSupported(AudioFileFormat.Type.WAVE, oais)) {
AudioSystem.write(oais, AudioFileFormat.Type.WAVE, fileOut);
}
/* play the file that was just written and wait until playback has finished */
System.out.println("Calling audioplayer:");
AudioPlayer player = new AudioPlayer(fileOut);
player.start();
player.join();
System.out.println("audioplayer finished...");
}
/**
 * Stand-alone vocoder: reads parameter files in SPTK format (little-endian floats), optionally applies a
 * "loud" or "soft" voice-quality transformation, synthesises the waveform with the MLSA vocoder, writes it
 * as a WAVE file and optionally plays it.
 * <p>
 * The type of spectrum parameters is selected through the stage (gamma) and alpha arguments.
 * <p>
 * Arguments, in the order in which they are read:
 * <ol>
 * <li>stage (gamma), integer</li>
 * <li>alpha, float</li>
 * <li>log gain: the literal "1" enables it, any other value disables it</li>
 * <li>beta (postfiltering), float</li>
 * <li>sampling rate, integer</li>
 * <li>frame period, integer</li>
 * <li>mgc file</li>
 * <li>mgc vector size</li>
 * <li>lf0 file</li>
 * <li>lf0 vector size</li>
 * <li>output wav file</li>
 * <li>optional, mixed excitation: str file, str vector size, mixed excitation filters file, number of
 * filters</li>
 * <li>optional, Fourier magnitudes: mag file, mag vector size</li>
 * <li>optional: "true" to play the generated file (default: do not play)</li>
 * <li>optional: "loud" or "soft" to transform the voice quality; anything else, or nothing, keeps the modal
 * voice</li>
 * </ol>
 * An optional group is considered present when at least two arguments remain after the preceding ones.
 * NOTE(review): because of this rule, a trailing "loud"/"soft" argument is only parsed as intended when both
 * the mixed excitation and the Fourier magnitude groups are supplied; otherwise the play flag and the quality
 * argument are taken for the next optional group. Confirm whether that restriction is intended.
 * <p>
 * Example with mixed excitation and Fourier magnitudes, playing the result and generating a soft voice:
 * <p>
 * 0 0.42 0.05 0.25 16000 80 cmu_us_arctic_slt_a0001.mgc 75 cmu_us_arctic_slt_a0001.lf0 3 vocoder_out.wav
 * cmu_us_arctic_slt_a0001.str 15 mix_excitation_filters.txt 5 cmu_us_arctic_slt_a0001.mag 30 true soft
 * <p>
 * Example without mixed excitation:
 * <p>
 * 0 0.42 0 0.0 16000 80 cmu_us_arctic_slt_a0001.mgc 75 cmu_us_arctic_slt_a0001.lf0 3 vocoder_out.wav true
 *
 * @param args
 *            command arguments as described above
 * @throws IOException
 *             if one of the parameter files cannot be read or the output file cannot be written
 * @throws InterruptedException
 *             if waiting for the audio player is interrupted
 * @throws Exception
 *             if the parameter streams or the vocoder fail
 */
public static void htsMLSAVocoderCommand(String[] args) throws IOException, InterruptedException, Exception {
    HMMData htsData = new HMMData();
    HTSPStream lf0Pst, mcepPst, strPst = null, magPst = null;
    String lf0File, mcepFile, strFile = "", magFile = "", outFile;
    int mcepVsize, lf0Vsize, strVsize = 0, magVsize = 0;

    // -----------------------------------
    // Relative transformation factors; a parameter p becomes p + p * factor.
    // Values for FEMALE:
    // LOUD:
    float f0LoudFemale = 0.01313791f;
    float strLoudFemale[] = { -0.002995137f, -0.042511885f, 0.072285673f, 0.127030178f, 0.006603170f };
    float magLoudFemale[] = { 0.0417336550f, 0.0002531457f, -0.0436839922f, -0.0335192265f, -0.0217501786f, -0.0166272925f,
            -0.0424825309f, -0.0460119758f, -0.0307114900f, -0.0327369397f };
    float mcepLoudFemale[] = { -0.245401838f, -0.062825965f, -0.360973095f, 0.117120506f, 0.917223265f, 0.138920770f,
            0.338553265f, -0.004857140f, 0.285192007f, -0.358292740f, -0.062907335f, -0.008040502f, 0.029470562f,
            -0.485079992f, -0.006727651f, -1.313869583f, -0.353797651f, 0.797097747f, -0.164614609f, -0.311173881f,
            -0.205134527f, -0.478116992f, -0.311340181f, -1.485855332f, -0.045632626f };
    // SOFT:
    float f0SoftFemale = 0.3107256f;
    float strSoftFemale[] = { 0.22054621f, 0.11091616f, 0.06378487f, 0.02110654f, -0.05118725f };
    float magSoftFemale[] = { 0.5747024f, 0.3248238f, 0.2356782f, 0.2441387f, 0.2702851f, 0.2895966f, 0.2437654f, 0.2959747f,
            0.2910529f, 0.2508167f };
    float mcepSoftFemale[] = { -0.103318169f, 0.315698439f, 0.170000964f, 0.223589719f, 0.262139649f, -0.062646758f,
            -4.998160141f, 0.008026212f, 1.742740835f, 1.990719666f, 0.548177521f, 0.999093856f, 0.262868363f, 1.755019406f,
            0.330058590f, -5.241305159f, -0.021005177f, -5.890942393f, 0.344385084f, 0.242179454f, 0.200936671f,
            -1.630683357f, 0.110674201f, -53.525043676f, -0.223682764f };
    // -----------------------------------
    // Values for MALE (kept for reference; only the FEMALE tables are selected below):
    // LOUD:
    float f0LoudMale = -0.08453168f;
    float strLoudMale[] = { 0.07092900f, 0.41149292f, 0.24479925f, 0.01326785f, -0.01517731f };
    float magLoudMale[] = { -0.21923620f, -0.11031120f, -0.02786084f, -0.10640244f, -0.12020442f, -0.08508762f, -0.08171423f,
            -0.08000552f, -0.07291968f, -0.09478534f };
    float mcepLoudMale[] = { 0.15335238f, 0.30880292f, -0.22922052f, -0.01116095f, 1.04088351f, -0.31693632f, -19.36510752f,
            -0.12210441f, 0.81743415f, -0.19799409f, 0.44572112f, -0.24845725f, -1.39545409f, -0.88788491f, 8.83006358f,
            -1.26623882f, 0.52428102f, -1.02615700f, -0.28092043f, -0.82543015f, 0.33081815f, 0.39498874f, 0.20100945f,
            0.60890790f, -0.37892217f };
    // SOFT:
    float f0SoftMale = 0.05088677f;
    float strSoftMale[] = { 0.07595702f, 0.02348965f, -0.02038628f, -0.08572970f, -0.06090386f };
    float magSoftMale[] = { 0.08869109f, 0.05517088f, 0.08902098f, 0.09263865f, 0.04866824f, 0.04554406f, 0.04937004f,
            0.05082076f, 0.04988959f, 0.03459440f };
    float mcepSoftMale[] = { 0.098129393f, 0.124686819f, 0.195709008f, -0.007066379f, -1.795620578f, 0.089982916f,
            15.371711686f, -0.051023831f, -0.213521945f, 0.009725292f, 0.361488718f, 0.118609995f, 1.794143134f,
            0.100130942f, 0.005999542f, -0.593128934f, -0.165385304f, 0.101705681f, 0.175534153f, 0.049246302f, 0.009530379f,
            -0.272557042f, -0.043030771f, 0.158694874f, 0.099107970f };

    float f0Trans = 0f;
    float strTrans[] = null;
    float magTrans[] = null;
    float mcepTrans[] = null;

    // set values that the vocoder needs
    // Type of features:
    int ind = 0;
    htsData.setStage(Integer.parseInt(args[ind++])); // sets gamma
    htsData.setAlpha(Float.parseFloat(args[ind++])); // set alpha
    htsData.setUseLogGain(args[ind++].contentEquals("1")); // use log gain only for the literal "1"
    htsData.setBeta(Float.parseFloat(args[ind++])); // set beta: for postfiltering
    htsData.setRate(Integer.parseInt(args[ind++])); // rate
    htsData.setFperiod(Integer.parseInt(args[ind++])); // period

    /* parameters extracted from real data with SPTK and snack */
    mcepFile = args[ind++];
    mcepVsize = Integer.parseInt(args[ind++]);
    lf0File = args[ind++];
    lf0Vsize = Integer.parseInt(args[ind++]);

    // output wav file
    outFile = args[ind++];

    // Optional:
    // if using mixed excitation
    if (args.length > (ind + 1)) {
        htsData.setUseMixExc(true);
        strFile = args[ind++];
        strVsize = Integer.parseInt(args[ind++]);
        FileInputStream mixedFiltersStream = new FileInputStream(args[ind++]);
        try {
            // the number of filters has to be known before the filters are read
            htsData.setNumFilters(Integer.parseInt(args[ind++]));
            htsData.readMixedExcitationFilters(mixedFiltersStream);
        } finally {
            // closing is harmless if the reader has already closed the stream
            mixedFiltersStream.close();
        }
        htsData.setPdfStrStream(null);
    } else {
        htsData.setUseMixExc(false);
    }

    // Optional:
    // if using Fourier magnitudes in mixed excitation
    if (args.length > (ind + 1)) {
        htsData.setUseFourierMag(true);
        magFile = args[ind++];
        magVsize = Integer.parseInt(args[ind++]);
        htsData.setPdfMagStream(null);
    } else {
        htsData.setUseFourierMag(false);
    }

    // Optional: true or false to play the file; a missing flag means "do not play"
    boolean play = false;
    if (ind < args.length) {
        play = Boolean.parseBoolean(args[ind++]);
    }

    // Optional: voice quality; a missing argument selects the modal (untransformed) voice
    // instead of failing with an ArrayIndexOutOfBoundsException
    String quality = "";
    if (ind < args.length) {
        quality = args[ind];
    }
    boolean trans = true;
    if (quality.contentEquals("loud")) {
        f0Trans = f0LoudFemale;
        strTrans = strLoudFemale;
        magTrans = magLoudFemale;
        mcepTrans = mcepLoudFemale;
        System.out.println("Generating loud voice");
    } else if (quality.contentEquals("soft")) {
        f0Trans = f0SoftFemale;
        strTrans = strSoftFemale;
        magTrans = magSoftFemale;
        mcepTrans = mcepSoftFemale;
        System.out.println("Generating soft voice");
    } else {
        trans = false;
        System.out.println("Generating modal voice");
    }

    // Change these for voice effects:
    // [min][max]
    htsData.setF0Std(1.0); // variable for f0 control, multiply f0 [1.0][0.0--5.0]
    htsData.setF0Mean(0.0); // variable for f0 control, add f0 [0.0][0.0--100.0]

    /*
     * Number of frames in each file: bytes / ((vectorSize / 3) * 4), 4 bytes per float. The division of the
     * vector size by 3 presumably accounts for static, delta and delta-delta windows -- TODO confirm.
     */
    long lengthLf0 = new File(lf0File).length() / ((lf0Vsize / 3) * 4);
    long lengthMcep = new File(mcepFile).length() / ((mcepVsize / 3) * 4);
    // choose the lowest frame count among the streams that are in use
    long frames = Math.min(lengthMcep, lengthLf0);
    if (htsData.getUseMixExc()) {
        frames = Math.min(frames, new File(strFile).length() / ((strVsize / 3) * 4));
    }
    if (htsData.getUseFourierMag()) {
        frames = Math.min(frames, new File(magFile).length() / ((magVsize / 3) * 4));
    }
    int totalFrame = (int) frames;
    System.out.println("Total number of Frames = " + totalFrame);

    boolean[] voiced = new boolean[totalFrame];
    float fval;
    int i, j;

    /* Initialise HTSPStream-s */
    lf0Pst = new HTSPStream(lf0Vsize, totalFrame, HMMData.FeatureType.LF0, 0);
    mcepPst = new HTSPStream(mcepVsize, totalFrame, HMMData.FeatureType.MGC, 0);

    /* load lf0 data */
    /* for lf0 only the voiced values are stored, packed at the start of the stream */
    int lf0VoicedFrame = 0;
    LEDataInputStream lf0Data = new LEDataInputStream(new BufferedInputStream(new FileInputStream(lf0File)));
    try {
        for (i = 0; i < totalFrame; i++) {
            fval = lf0Data.readFloat();
            if (fval < 0) {
                voiced[i] = false;
            } else {
                voiced[i] = true;
                // the transformation factor is relative to f0, so apply it in the linear domain
                if (trans) {
                    fval = (float) Math.exp(fval);
                    fval = fval + (fval * f0Trans);
                    fval = (float) Math.log(fval);
                }
                lf0Pst.setPar(lf0VoicedFrame, 0, fval);
                lf0VoicedFrame++;
            }
        }
    } finally {
        lf0Data.close();
    }

    /* load mgc data */
    LEDataInputStream mcepData = new LEDataInputStream(new BufferedInputStream(new FileInputStream(mcepFile)));
    try {
        for (i = 0; i < totalFrame; i++) {
            for (j = 0; j < mcepPst.getOrder(); j++) {
                fval = mcepData.readFloat();
                // only the first four coefficients are transformed
                if (trans && j < 4)
                    fval = fval + (fval * mcepTrans[j]);
                mcepPst.setPar(i, j, fval);
            }
        }
    } finally {
        mcepData.close();
    }

    /* load str data */
    if (htsData.getUseMixExc()) {
        strPst = new HTSPStream(strVsize, totalFrame, HMMData.FeatureType.STR, 0);
        LEDataInputStream strData = new LEDataInputStream(new BufferedInputStream(new FileInputStream(strFile)));
        try {
            for (i = 0; i < totalFrame; i++) {
                for (j = 0; j < strPst.getOrder(); j++) {
                    fval = strData.readFloat();
                    // apply here the change to loud/soft
                    if (trans)
                        fval = fval + (fval * strTrans[j]);
                    strPst.setPar(i, j, fval);
                }
            }
        } finally {
            strData.close();
        }
    }

    /* load mag data */
    if (htsData.getUseFourierMag()) {
        magPst = new HTSPStream(magVsize, totalFrame, HMMData.FeatureType.MAG, 0);
        LEDataInputStream magData = new LEDataInputStream(new BufferedInputStream(new FileInputStream(magFile)));
        try {
            for (i = 0; i < totalFrame; i++) {
                for (j = 0; j < magPst.getOrder(); j++) {
                    fval = magData.readFloat();
                    // apply here the change to loud/soft
                    if (trans)
                        fval = fval + (fval * magTrans[j]);
                    magPst.setPar(i, j, fval);
                }
            }
        } finally {
            magData.close();
        }
    }

    AudioFormat af = getHTSAudioFormat(htsData);
    HTSVocoder par2speech = new HTSVocoder();
    double[] audio = par2speech.htsMLSAVocoder(lf0Pst, mcepPst, strPst, magPst, voiced, htsData, null);

    long lengthInSamples = (audio.length * 2) / (af.getSampleSizeInBits() / 8);
    logger.debug("length in samples=" + lengthInSamples);

    /* Normalise the signal between 1 and -1; an all-zero signal is left untouched to avoid 0/0 = NaN */
    double maxSample = MathUtils.getAbsMax(audio);
    if (maxSample > 0) {
        for (i = 0; i < audio.length; i++)
            audio[i] = (audio[i] / maxSample);
    }

    DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio), af);
    File fileOut = new File(outFile);
    System.out.println("saving to file: " + outFile);
    if (AudioSystem.isFileTypeSupported(AudioFileFormat.Type.WAVE, oais)) {
        AudioSystem.write(oais, AudioFileFormat.Type.WAVE, fileOut);
    }

    if (play) {
        System.out.println("Calling audioplayer:");
        AudioPlayer player = new AudioPlayer(fileOut);
        player.start();
        player.join();
        System.out.println("audioplayer finished...");
    }
}
/**
 * Command-line entry point. As it stands it only configures log4j with the basic configuration; every
 * vocoder invocation below is commented out and serves as a usage example that can be enabled by hand.
 *
 * @param args
 *            command-line arguments; not read by the active code, only by the commented-out
 *            main1(args) and vocoderList(args) calls
 * @throws IOException
 *             declared for the commented-out invocations
 * @throws InterruptedException
 *             declared for the commented-out invocations
 * @throws Exception
 *             declared for the commented-out invocations
 */
public static void main(String[] args) throws IOException, InterruptedException, Exception {
/* configure log info */
org.apache.log4j.BasicConfigurator.configure();
// copy synthesis: requires a hmm voice
// main1(args);
// copy synthesis: requires parameters, see description
// example of parameters:
// NOTE(review): the two examples in the next comment, and args1/args2 in the one after it, carry five
// leading numeric values, whereas htsMLSAVocoderCommand reads six (stage, alpha, log gain, beta, rate,
// frame period) before the mgc file; they appear to be outdated -- confirm before reusing them.
/*
 * 0 0.45 0 16000 80 /project/mary/marcela/HMM-voices/roger/hts/data/mgc/roger_5739.mgc 75
 * /project/mary/marcela/HMM-voices/roger/hts/data/lf0/roger_5739.lf0 3
 * /project/mary/marcela/HMM-voices/roger/vocoder_out.wav
 * /project/mary/marcela/HMM-voices/roger/hts/data/str/roger_5739.str 15
 * /project/mary/marcela/HMM-voices/roger/hts/data/filters/mix_excitation_filters.txt 5 48
 * /project/mary/marcela/HMM-voices/roger/hts/data/mag/roger_5739.mag 30
 *
 * example input parameters without mixed excitation: 0 0.45 0 16000 80
 * /project/mary/marcela/HMM-voices/roger/hts/data/mgc/roger_5739.mgc 75
 * /project/mary/marcela/HMM-voices/roger/hts/data/lf0/roger_5739.lf0 3
 * /project/mary/marcela/HMM-voices/roger/vocoder_out1.wav
 */
/*
 * String topic = "pru013"; String path = "/project/mary/marcela/HMM-voices/prudence/hts/data/"; // with mixed excitation
 * String args1[] = {"0", "0.45", "0", "16000", "80", path + "mgc/" + topic + ".mgc", "75", path + "lf0/" + topic +
 * ".lf0", "3", path + "vocoder/" + topic + ".wav", path + "str/" + topic + ".str", "15", path +
 * "filters/mix_excitation_filters.txt", "5", "48", path + "mag/" + topic + ".mag", "30", "true"};
 *
 * // without mixed excitation String args2[] = {"0", "0.45", "0", "16000", "80", path + "mgc/" + topic + ".mgc", "75",
 * path + "lf0/" + topic + ".lf0", "3", path + "/" + topic + ".wav", "true"};
 *
 * HTSVocoder vocoder = new HTSVocoder(); vocoder.htsMLSAVocoderCommand(args2);
 */
// The following examples use six leading values: mixed excitation only, play flag "true".
/*
 * String path = "/project/mary/marcela/HMM-voices/BITS/bits1/hts/data/"; String args3[] = {"0", "0.42", "0.05", "0.3",
 * "16000", "80", path + "mgc/US10010046_0.mgc", "75", path + "lf0-100-270/US10010046_0.lf0", "3", path +
 * "vocoder_out-100-270.wav", path + "str-100-270/US10010046_0.str", "15", path + "filters/mix_excitation_filters.txt",
 * "5", "true"}; HTSVocoder vocoder = new HTSVocoder(); vocoder.htsMLSAVocoderCommand(args3);
 */
/*
 * String path = "/project/mary/marcela/quality_parameters/necadbs/hts/data/"; String args3[] = {"0", "0.42", "0.05",
 * "0.15", "16000", "80", path + "mgc/modal0001.mgc", "75", path + "lf0/modal0001.lf0", "3", path +
 * "vocoder_out-modal-soft.wav", path + "str/soft0001.str", "15", path + "filters/mix_excitation_filters.txt", "5",
 * "true"}; HTSVocoder vocoder = new HTSVocoder(); vocoder.htsMLSAVocoderCommand(args3);
 */
// Mixed excitation plus Fourier magnitudes, play flag "true" and the "soft" voice-quality transformation.
/*
 * String path = "/project/mary/marcela/HMM-voices/arctic_slt/hts/data/"; String fileName = "modal0002"; //String fileName
 * = "de_0001"; String args4[] = {"0", "0.42", "0.05", "0.25", "16000", "80", path + "mgc/" + fileName + ".mgc", "75",
 * path + "lf0/" + fileName + ".lf0", "3", path + "vocoder/" + fileName + "_vocoder_soft.wav", path + "str/" + fileName +
 * ".str", "15", path + "filters/mix_excitation_filters.txt", "5", path + "mag/" + fileName + ".mag", "30", "true",
 * "soft"}; HTSVocoder vocoder = new HTSVocoder(); vocoder.htsMLSAVocoderCommand(args4);
 */
/* Use this for running HTSVocoder for a list, see vocoderList for the parameters */
/*
 * HTSVocoder vocoder = new HTSVocoder(); vocoder.vocoderList(args);
 */
}
/**
 * Batch driver: runs {@code htsMLSAVocoderCommand} once for every basename with extension ".raw" found in
 * the "raw" sub-directory of a hard-coded data directory. For each basename the mgc, lf0 and str files are
 * taken from the sibling "mgc", "lf0" and "str" sub-directories and the result is written to
 * "vocoder/&lt;basename&gt;.wav"; the "vocoder" directory is created (one level only) when it does not exist.
 *
 * @param args
 *            not used; the data directory and all vocoder settings are fixed in the method body
 * @throws IOException
 *             propagated from the vocoder command
 * @throws InterruptedException
 *             propagated from the vocoder command
 * @throws Exception
 *             propagated from the vocoder command
 */
public static void vocoderList(String[] args) throws IOException, InterruptedException, Exception {
    // Data directories used at other times:
    // /project/mary/marcela/HMM-voices/SEMAINE/prudence/hts/data/
    // /project/mary/marcela/HMM-voices/arctic_test/hts/data/
    // /project/mary/marcela/HMM-voices/SEMAINE/spike/hts/data/
    // /project/mary/marcela/HMM-voices/arctic_slt/hts/data/
    // /project/mary/marcela/HMM-voices/BITS/bits1/hts/data/
    String path = "/project/mary/marcela/quality_parameters/necadbs/hts/data/";

    // the output goes to path/vocoder
    File vocoderDir = new File(path + "vocoder");
    if (!vocoderDir.exists()) {
        vocoderDir.mkdir();
    }

    File rawDir = new File(path + "raw");
    String[] basenames = FileUtils.listBasenames(rawDir, ".raw");

    for (String basename : basenames) {
        System.out.println("file: " + basename);

        String mgcFile = path + "mgc/" + basename + ".mgc";
        String lf0File = path + "lf0/" + basename + ".lf0";
        String wavFile = path + "vocoder/" + basename + ".wav";
        String strFile = path + "str/" + basename + ".str";
        String filtersFile = path + "filters/mix_excitation_filters.txt";

        // Settings noted by the original author:
        // MGC stage=0.0 alpha=0.42 logGain=0 (false)
        // MGC-LSP stage=3.0 alpha=0.42 loggain=1 (true)
        //
        // Active configuration: mixed excitation without Fourier magnitudes. The final "true" asks for the
        // generated file to be played.
        // Variants used at other times:
        // - with Fourier magnitudes: third value "0", and path + "mag/" + basename + ".mag", "30" inserted
        // before the final "true";
        // - without mixed excitation and Fourier magnitudes: third value "0", fourth value "0.0", and the
        // str file, "15", the filters file and "5" dropped.
        String[] commandArgs = { "0", "0.42", "0.05", "0.15", "16000", "80", mgcFile, "75", lf0File, "3", wavFile,
                strFile, "15", filtersFile, "5", "true" };

        htsMLSAVocoderCommand(commandArgs);
    }
}
protected class HTSVocoderDataProducer extends ProducingDoubleDataSource {
private static final double INITIAL_MAX_AMPLITUDE = 17000.;
// Values used by the synthesis thread
private HTSPStream lf0Pst;
private HTSPStream mcepPst;
private HTSPStream strPst;
private HTSPStream magPst;
private boolean[] voiced;
private HMMData htsData;
public HTSVocoderDataProducer(int audioSize, HTSParameterGeneration pdf2par, HMMData htsData) {
super(audioSize, new AmplitudeNormalizer(INITIAL_MAX_AMPLITUDE));
lf0Pst = pdf2par.getlf0Pst();
mcepPst = pdf2par.getMcepPst();
strPst = pdf2par.getStrPst();
magPst = pdf2par.getMagPst();
voiced = pdf2par.getVoicedArray();
this.htsData = htsData;
}
public void run() {
try {
htsMLSAVocoder(lf0Pst, mcepPst, strPst, magPst, voiced, htsData, this);
putEndOfStream();
} catch (Exception e) {
logger.error("Cannot vocode", e);
}
}
}
} /* class HTSVocoder */