diff --git a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs index 3cc20c0d7..d50b30007 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerPitch.cs @@ -27,6 +27,8 @@ public class DsPitch : IDisposable DiffSingerSpeakerEmbedManager speakerEmbedManager; const string PEXP = DiffSingerUtils.PEXP; + public float FrameMs => frameMs; + public DsPitch(string rootPath) { this.rootPath = rootPath; @@ -107,7 +109,7 @@ int PhonemeTokenize(string phoneme){ return token; } - public RenderPitchResult Process(RenderPhrase phrase){ + public RenderPitchResult Process(RenderPhrase phrase, HashSet? retakeNoteIndexes = null, float[]? existingPitch = null){ var startMs = phrase.phones[0].positionMs - DiffSingerUtils.GetHeadMs(frameMs); int headFrames = DiffSingerUtils.headFrames; int tailFrames = DiffSingerUtils.tailFrames; @@ -184,21 +186,29 @@ public RenderPitchResult Process(RenderPhrase phrase){ var noteDurMsList = new List(); var noteMidiList = new List(); var noteRestList = new List(); + //paddedToRealNoteIndex is kept in lockstep with noteDurMsList so the retake + //frame mask can map each padded segment to the real note it belongs to. + //Gap-rest segments inserted below follow the preceding real note. + var paddedToRealNoteIndex = new List(); //Head padding noteDurMsList.Add(Math.Max(0, phrase.notes[0].positionMs - startMs)); noteMidiList.Add(phrase.notes[0].adjustedTone); noteRestList.Add(true); + paddedToRealNoteIndex.Add(0); double prevNoteEndMs = phrase.notes[0].positionMs; - foreach (var note in phrase.notes) { + for (int realIdx = 0; realIdx < phrase.notes.Length; realIdx++) { + var note = phrase.notes[realIdx]; double gapMs = note.positionMs - prevNoteEndMs; if (gapMs > 0) { - //Insert a rest note for the gap + //Insert a rest note for the gap; associate it with the previous real note noteDurMsList.Add(gapMs); noteMidiList.Add(note.adjustedTone); noteRestList.Add(true); + paddedToRealNoteIndex.Add(realIdx - 1); } noteDurMsList.Add(note.durationMs); noteMidiList.Add(note.adjustedTone); + paddedToRealNoteIndex.Add(realIdx); //Slur notes follow the previous note's rest status if (note.lyric.StartsWith("+")) { noteRestList.Add(noteRestList[^1]); @@ -217,6 +227,7 @@ public RenderPitchResult Process(RenderPhrase phrase){ noteDurMsList.Add(DiffSingerUtils.GetTailMs(frameMs)); noteMidiList.Add(phrase.notes[^1].adjustedTone); noteRestList.Add(true); + paddedToRealNoteIndex.Add(phrase.notes.Length - 1); //Set tone for each rest group using nearest non-rest note var note_rest = noteRestList; @@ -251,6 +262,13 @@ public RenderPitchResult Process(RenderPhrase phrase){ .ToList(); var pitch = Enumerable.Repeat(60f, totalFrames).ToArray(); var retake = Enumerable.Repeat(true, totalFrames).ToArray(); + if (retakeNoteIndexes != null && existingPitch != null) { + retake = DiffSingerRetake.BuildRetakeFrameMask( + note_dur, paddedToRealNoteIndex, retakeNoteIndexes, totalFrames); + for (int i = 0; i < totalFrames && i < existingPitch.Length; i++) { + pitch[i] = existingPitch[i]; + } + } var pitchInputs = new List(); pitchInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out)); pitchInputs.Add(NamedOnnxValue.CreateFromTensor("note_midi", @@ -322,14 +340,16 @@ public RenderPitchResult Process(RenderPhrase phrase){ .Select(i=>(float)phrase.timeAxis.MsPosToTickPos(startMs + i*frameMs) - phrase.position) .Append((float)phrase.duration + 1) .ToArray(), - tones = pitch_out.Append(pitch_out[^1]).ToArray() + tones = pitch_out.Append(pitch_out[^1]).ToArray(), + retakeMask = retakeNoteIndexes != null ? retake.Append(retake[^1]).ToArray() : null, }; }else{ return new RenderPitchResult{ ticks = Enumerable.Range(0,totalFrames) .Select(i=>(float)phrase.timeAxis.MsPosToTickPos(startMs + i*frameMs) - phrase.position) .ToArray(), - tones = pitch_out + tones = pitch_out, + retakeMask = retakeNoteIndexes != null ? retake : null, }; } } diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs index 7844eea55..08739df57 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs @@ -508,6 +508,38 @@ public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase) { } } + public RenderPitchResult LoadRenderedPitch(RenderPhrase phrase, HashSet selectedNotePositions) { + if (!Preferences.Default.DiffSingerLocalRetaking) { + return LoadRenderedPitch(phrase); + } + DiffSingerSinger singer = (DiffSingerSinger) phrase.singer; + if (!singer.HasPitchPredictor) { + throw new Exception("This singer has no pitch predictor."); + } + var pitchPredictor = singer.getPitchPredictor()!; + var noteRelativePositions = new int[phrase.notes.Length]; + for (int i = 0; i < phrase.notes.Length; i++) { + noteRelativePositions[i] = phrase.notes[i].position; + } + var retakeNoteIndexes = DiffSingerRetake.MapSelectedPositionsToNoteIndexes( + phrase.position, noteRelativePositions, selectedNotePositions); + if (retakeNoteIndexes.Count == 0 || retakeNoteIndexes.Count == phrase.notes.Length) { + lock (pitchPredictor) { + return pitchPredictor.Process(phrase); + } + } + var frameMs = pitchPredictor.FrameMs; + int headFrames = DiffSingerUtils.headFrames; + int tailFrames = DiffSingerUtils.tailFrames; + var ph_dur = DiffSingerUtils.PaddedPhoneDurations(phrase, frameMs, headFrames, tailFrames); + int totalFrames = ph_dur.Sum(); + var existingPitch = DiffSingerUtils.SampleCurve(phrase, phrase.pitches, 0, frameMs, totalFrames, headFrames, tailFrames, + x => x * 0.01).Select(f => (float)f).ToArray(); + lock (pitchPredictor) { + return pitchPredictor.Process(phrase, retakeNoteIndexes, existingPitch); + } + } + public List LoadRenderedRealCurves(RenderPhrase phrase) { if (!Preferences.Default.DiffSingerTensorCache) { throw new Exception("Please enable DiffSinger tensor cache and re-render the phrase to display correct base curves."); diff --git a/OpenUtau.Core/DiffSinger/DiffSingerRetake.cs b/OpenUtau.Core/DiffSinger/DiffSingerRetake.cs new file mode 100644 index 000000000..d5787906b --- /dev/null +++ b/OpenUtau.Core/DiffSinger/DiffSingerRetake.cs @@ -0,0 +1,52 @@ +using System.Collections.Generic; + +namespace OpenUtau.Core.DiffSinger { + public static class DiffSingerRetake { + public static HashSet MapSelectedPositionsToNoteIndexes( + int phrasePosition, + IReadOnlyList noteRelativePositions, + IReadOnlyCollection? selectedAbsolutePositions) { + var result = new HashSet(); + if (selectedAbsolutePositions == null || selectedAbsolutePositions.Count == 0) { + return result; + } + var lookup = selectedAbsolutePositions as ISet ?? new HashSet(selectedAbsolutePositions); + for (int i = 0; i < noteRelativePositions.Count; i++) { + if (lookup.Contains(phrasePosition + noteRelativePositions[i])) { + result.Add(i); + } + } + return result; + } + + // paddedToRealNoteIndex must be the same length as paddedNoteDurations. + // Each entry is the real-note index the padded segment should follow for retake purposes, + // or -1 for a segment that is never retaken regardless of selection. + public static bool[] BuildRetakeFrameMask( + IReadOnlyList paddedNoteDurations, + IReadOnlyList paddedToRealNoteIndex, + IReadOnlyCollection? retakeNoteIndexes, + int totalFrames) { + var mask = new bool[totalFrames]; + if (retakeNoteIndexes == null || retakeNoteIndexes.Count == 0 || paddedNoteDurations.Count == 0) { + return mask; + } + var lookup = retakeNoteIndexes as ISet ?? new HashSet(retakeNoteIndexes); + int padded = paddedNoteDurations.Count; + int frameOffset = 0; + for (int segIdx = 0; segIdx < padded; segIdx++) { + int realIdx = paddedToRealNoteIndex[segIdx]; + bool shouldRetake = realIdx >= 0 && lookup.Contains(realIdx); + int dur = paddedNoteDurations[segIdx]; + for (int f = 0; f < dur; f++) { + int fi = frameOffset + f; + if (fi < totalFrames) { + mask[fi] = shouldRetake; + } + } + frameOffset += dur; + } + return mask; + } + } +} diff --git a/OpenUtau.Core/Editing/NoteBatchEdits.cs b/OpenUtau.Core/Editing/NoteBatchEdits.cs index 817315430..78a4d8b44 100644 --- a/OpenUtau.Core/Editing/NoteBatchEdits.cs +++ b/OpenUtau.Core/Editing/NoteBatchEdits.cs @@ -488,7 +488,7 @@ public void RunAsync( var commands = new List(); for (int ph_i = phrases.Count() - 1; ph_i >= 0; ph_i--) { var phrase = phrases[ph_i]; - var result = renderer.LoadRenderedPitch(phrase); + var result = renderer.LoadRenderedPitch(phrase, positions); if (result == null) { continue; } @@ -502,6 +502,9 @@ public void RunAsync( if (result.tones[i] < 0) { continue; } + if (result.retakeMask != null && i < result.retakeMask.Length && !result.retakeMask[i]) { + continue; + } int x = phrase.position - part.position + (int)result.ticks[i]; if (result.ticks[i] < 0) { if (i + 1 < result.ticks.Length && result.ticks[i + 1] > 0) { } else diff --git a/OpenUtau.Core/Render/IRenderer.cs b/OpenUtau.Core/Render/IRenderer.cs index a1d420cc1..9579a9171 100644 --- a/OpenUtau.Core/Render/IRenderer.cs +++ b/OpenUtau.Core/Render/IRenderer.cs @@ -40,6 +40,11 @@ public class RenderPitchResult { /// Semitone values in MIDI scale. /// public float[] tones; + + /// + /// Per-frame mask indicating retaken frames. Null means full retake. + /// + public bool[]? retakeMask; } public class RenderRealCurveResult { @@ -70,6 +75,7 @@ public interface IRenderer { RenderResult Layout(RenderPhrase phrase); Task Render(RenderPhrase phrase, Progress progress, int trackNo, CancellationTokenSource cancellation, bool isPreRender = false); RenderPitchResult LoadRenderedPitch(RenderPhrase phrase); + RenderPitchResult LoadRenderedPitch(RenderPhrase phrase, HashSet selectedNotePositions) { return LoadRenderedPitch(phrase); } List LoadRenderedRealCurves(RenderPhrase phrase) { return new List(0);} UExpressionDescriptor[] GetSuggestedExpressions(USinger singer, URenderSettings renderSettings); } diff --git a/OpenUtau.Core/Util/Preferences.cs b/OpenUtau.Core/Util/Preferences.cs index 46aa926f1..0cb0127f6 100644 --- a/OpenUtau.Core/Util/Preferences.cs +++ b/OpenUtau.Core/Util/Preferences.cs @@ -166,6 +166,7 @@ public class SerializablePreferences { public int DiffSingerStepsPitch = 10; public bool DiffSingerTensorCache = true; public bool DiffSingerLangCodeHide = false; + public bool DiffSingerLocalRetaking = false; public bool SkipRenderingMutedTracks = false; public string Language = string.Empty; public string? SortingOrder = null; diff --git a/OpenUtau.Test/Core/DiffSinger/DiffSingerRetakeTest.cs b/OpenUtau.Test/Core/DiffSinger/DiffSingerRetakeTest.cs new file mode 100644 index 000000000..88d1c4879 --- /dev/null +++ b/OpenUtau.Test/Core/DiffSinger/DiffSingerRetakeTest.cs @@ -0,0 +1,185 @@ +using System.Collections.Generic; +using System.Linq; +using Xunit; + +namespace OpenUtau.Core.DiffSinger { + public class DiffSingerRetakeTest { + [Fact] + public void MapSelectedPositionsToNoteIndexes_PicksMatchingNotes() { + var noteRel = new[] { 0, 480, 960, 1440 }; + var selected = new HashSet { 100 + 480, 100 + 1440 }; + + var result = DiffSingerRetake.MapSelectedPositionsToNoteIndexes(100, noteRel, selected); + + Assert.Equal(new HashSet { 1, 3 }, result); + } + + [Fact] + public void MapSelectedPositionsToNoteIndexes_ReturnsEmptyWhenNoneSelected() { + var noteRel = new[] { 0, 480 }; + var result = DiffSingerRetake.MapSelectedPositionsToNoteIndexes(0, noteRel, new HashSet()); + Assert.Empty(result); + } + + [Fact] + public void MapSelectedPositionsToNoteIndexes_HandlesNullSelected() { + var noteRel = new[] { 0, 480 }; + var result = DiffSingerRetake.MapSelectedPositionsToNoteIndexes(0, noteRel, null); + Assert.Empty(result); + } + + [Fact] + public void BuildRetakeFrameMask_AllSelected_AllTrue() { + var paddedDurations = new[] { 2, 5, 5, 2 }; + var paddedToReal = new[] { 0, 0, 1, 1 }; + var totalFrames = paddedDurations.Sum(); + var indexes = new HashSet { 0, 1 }; + + var mask = DiffSingerRetake.BuildRetakeFrameMask(paddedDurations, paddedToReal, indexes, totalFrames); + + Assert.Equal(totalFrames, mask.Length); + Assert.All(mask, b => Assert.True(b)); + } + + [Fact] + public void BuildRetakeFrameMask_NoneSelected_AllFalse() { + var paddedDurations = new[] { 2, 5, 5, 2 }; + var paddedToReal = new[] { 0, 0, 1, 1 }; + var totalFrames = paddedDurations.Sum(); + var indexes = new HashSet(); + + var mask = DiffSingerRetake.BuildRetakeFrameMask(paddedDurations, paddedToReal, indexes, totalFrames); + + Assert.Equal(totalFrames, mask.Length); + Assert.All(mask, b => Assert.False(b)); + } + + [Fact] + public void BuildRetakeFrameMask_PartialSelected_RespectsHeadTailPaddingShift() { + // 3 real notes, padded with head + tail → 5 padded "note durations". + // Mapping: padded[0] → real 0 (head), padded[1] → real 0, padded[2] → real 1, + // padded[3] → real 2, padded[4] → real 2 (tail). + var paddedDurations = new[] { 2, 3, 4, 3, 2 }; // 14 frames total + var paddedToReal = new[] { 0, 0, 1, 2, 2 }; + int totalFrames = paddedDurations.Sum(); + var indexes = new HashSet { 1 }; // retake only middle real note + + var mask = DiffSingerRetake.BuildRetakeFrameMask(paddedDurations, paddedToReal, indexes, totalFrames); + + // padded[0] (frames 0-1, head→real 0) → false + Assert.False(mask[0]); + Assert.False(mask[1]); + // padded[1] (frames 2-4, real 0) → false + Assert.False(mask[2]); + Assert.False(mask[4]); + // padded[2] (frames 5-8, real 1) → true + Assert.True(mask[5]); + Assert.True(mask[8]); + // padded[3] (frames 9-11, real 2) → false + Assert.False(mask[9]); + Assert.False(mask[11]); + // padded[4] (frames 12-13, tail→real 2) → false + Assert.False(mask[12]); + Assert.False(mask[13]); + } + + [Fact] + public void BuildRetakeFrameMask_FirstRealNoteSelected_HeadPadIncluded() { + // Selecting real note 0 should mark both head (padded[0]) and padded[1] frames. + var paddedDurations = new[] { 2, 3, 4, 3, 2 }; + var paddedToReal = new[] { 0, 0, 1, 2, 2 }; + int totalFrames = paddedDurations.Sum(); + var indexes = new HashSet { 0 }; + + var mask = DiffSingerRetake.BuildRetakeFrameMask(paddedDurations, paddedToReal, indexes, totalFrames); + + Assert.True(mask[0]); + Assert.True(mask[1]); // head padded → real 0 + Assert.True(mask[2]); + Assert.True(mask[4]); // padded[1] → real 0 + Assert.False(mask[5]); // padded[2] → real 1, not selected + } + + [Fact] + public void BuildRetakeFrameMask_LastRealNoteSelected_TailPadIncluded() { + var paddedDurations = new[] { 2, 3, 4, 3, 2 }; + var paddedToReal = new[] { 0, 0, 1, 2, 2 }; + int totalFrames = paddedDurations.Sum(); + var indexes = new HashSet { 2 }; // last real note + + var mask = DiffSingerRetake.BuildRetakeFrameMask(paddedDurations, paddedToReal, indexes, totalFrames); + + Assert.False(mask[8]); + Assert.True(mask[9]); // padded[3] → real 2 + Assert.True(mask[11]); + Assert.True(mask[12]); // padded[4] tail → real 2 + Assert.True(mask[13]); + } + + [Fact] + public void BuildRetakeFrameMask_ClampsFramesPastTotal() { + // paddedDurations sum to 10 but totalFrames is 8 (simulating FitDurationSum trim). + var paddedDurations = new[] { 2, 4, 4 }; + var paddedToReal = new[] { 0, 0, 0 }; + var indexes = new HashSet { 0 }; + + var mask = DiffSingerRetake.BuildRetakeFrameMask(paddedDurations, paddedToReal, indexes, 8); + + Assert.Equal(8, mask.Length); + // Should not throw; frames past totalFrames silently dropped. + Assert.True(mask[0]); + Assert.True(mask[5]); + } + + [Fact] + public void BuildRetakeFrameMask_EmptyDurations_ReturnsAllFalse() { + var mask = DiffSingerRetake.BuildRetakeFrameMask( + new int[0], new int[0], new HashSet { 0 }, 4); + Assert.Equal(4, mask.Length); + Assert.All(mask, b => Assert.False(b)); + } + + [Fact] + public void BuildRetakeFrameMask_GapRestFollowsPreviousNote() { + // Real notes [0, 1] with a gap between them. + // Padded layout: head→0, real 0, gap→0 (follows prev), real 1, tail→1. + var paddedDurations = new[] { 2, 3, 2, 4, 2 }; // 13 frames total + var paddedToReal = new[] { 0, 0, 0, 1, 1 }; + int totalFrames = paddedDurations.Sum(); + + // Select real note 0 only: head + real 0 + gap should retake; real 1 + tail should not. + var maskSelectFirst = DiffSingerRetake.BuildRetakeFrameMask( + paddedDurations, paddedToReal, new HashSet { 0 }, totalFrames); + Assert.True(maskSelectFirst[0]); // head + Assert.True(maskSelectFirst[4]); // real 0 + Assert.True(maskSelectFirst[5]); // gap (follows real 0) + Assert.True(maskSelectFirst[6]); // gap + Assert.False(maskSelectFirst[7]); // real 1 + Assert.False(maskSelectFirst[12]); // tail (follows real 1) + + // Select real note 1 only: gap stays unretaken (it follows real 0). + var maskSelectSecond = DiffSingerRetake.BuildRetakeFrameMask( + paddedDurations, paddedToReal, new HashSet { 1 }, totalFrames); + Assert.False(maskSelectSecond[5]); // gap not retaken + Assert.False(maskSelectSecond[6]); + Assert.True(maskSelectSecond[7]); // real 1 + Assert.True(maskSelectSecond[12]); // tail + } + + [Fact] + public void BuildRetakeFrameMask_SegmentMarkedMinusOne_NeverRetakes() { + // -1 in paddedToRealNoteIndex means "never retake this segment". + var paddedDurations = new[] { 2, 3, 2, 4, 2 }; + var paddedToReal = new[] { 0, 0, -1, 1, 1 }; // gap as -1 + int totalFrames = paddedDurations.Sum(); + + var mask = DiffSingerRetake.BuildRetakeFrameMask( + paddedDurations, paddedToReal, new HashSet { 0, 1 }, totalFrames); + + Assert.True(mask[4]); // real 0 + Assert.False(mask[5]); // gap (-1) + Assert.False(mask[6]); + Assert.True(mask[7]); // real 1 + } + } +} diff --git a/OpenUtau/Strings/Strings.axaml b/OpenUtau/Strings/Strings.axaml index 7c08bab77..104d90967 100644 --- a/OpenUtau/Strings/Strings.axaml +++ b/OpenUtau/Strings/Strings.axaml @@ -627,6 +627,7 @@ Warning: this option removes custom presets. DiffSinger Render Steps for Acoustic DiffSinger Render Steps for Pitch DiffSinger Render Steps for Variance + DiffSinger Pitch Local Retaking GPU Machine Learning Runner Phase Compensation diff --git a/OpenUtau/Strings/Strings.zh-CN.axaml b/OpenUtau/Strings/Strings.zh-CN.axaml index 8c730fbaa..473153673 100644 --- a/OpenUtau/Strings/Strings.zh-CN.axaml +++ b/OpenUtau/Strings/Strings.zh-CN.axaml @@ -485,6 +485,7 @@ Syntax: prefix,suffix--> + DiffSinger 音高局部重录 机器学习运行器 相位修正 diff --git a/OpenUtau/ViewModels/PreferencesViewModel.cs b/OpenUtau/ViewModels/PreferencesViewModel.cs index 9db829b46..a740d45c4 100644 --- a/OpenUtau/ViewModels/PreferencesViewModel.cs +++ b/OpenUtau/ViewModels/PreferencesViewModel.cs @@ -121,6 +121,7 @@ public int SafeMaxThreadCount { [Reactive] public double DiffSingerDepth { get; set; } [Reactive] public bool DiffSingerTensorCache { get; set; } [Reactive] public bool DiffSingerLangCodeHide { get; set; } + [Reactive] public bool DiffSingerLocalRetaking { get; set; } // Advanced [Reactive] public bool RememberMid { get; set; } @@ -175,6 +176,7 @@ public PreferencesViewModel() { DiffSingerStepsPitch = Preferences.Default.DiffSingerStepsPitch; DiffSingerTensorCache = Preferences.Default.DiffSingerTensorCache; DiffSingerLangCodeHide = Preferences.Default.DiffSingerLangCodeHide; + DiffSingerLocalRetaking = Preferences.Default.DiffSingerLocalRetaking; SkipRenderingMutedTracks = Preferences.Default.SkipRenderingMutedTracks; ThemeName = Preferences.Default.ThemeName; PenPlusDefault = Preferences.Default.PenPlusDefault; @@ -398,6 +400,11 @@ public PreferencesViewModel() { Preferences.Default.DiffSingerLangCodeHide = useCache; Preferences.Save(); }); + this.WhenAnyValue(vm => vm.DiffSingerLocalRetaking) + .Subscribe(value => { + Preferences.Default.DiffSingerLocalRetaking = value; + Preferences.Save(); + }); this.WhenAnyValue(vm => vm.SkipRenderingMutedTracks) .Subscribe(skipRenderingMutedTracks => { Preferences.Default.SkipRenderingMutedTracks = skipRenderingMutedTracks; diff --git a/OpenUtau/Views/PreferencesDialog.axaml b/OpenUtau/Views/PreferencesDialog.axaml index e116bbe34..bbe07e8a4 100644 --- a/OpenUtau/Views/PreferencesDialog.axaml +++ b/OpenUtau/Views/PreferencesDialog.axaml @@ -329,6 +329,10 @@ + + + +