From 6b1089515415a0b76e6c88bc0008a3c90ed7da2e Mon Sep 17 00:00:00 2001 From: jskrzypek Date: Mon, 17 Jun 2019 18:14:43 +0200 Subject: [PATCH 1/6] ogg opus file options --- cpp/libtribune-client/tribune_client.cpp | 7 ++ cpp/libtribune-client/tribune_client.h | 1 + cpp/libtribune-client/tribune_tts.pb.cc | 95 +++++++++++++++++++----- cpp/libtribune-client/tribune_tts.pb.h | 21 ++++++ cpp/tribune-client/main.cpp | 15 +++- proto/tribune_tts.proto | 3 + python/call_synthesize.py | 9 ++- python/tribune_client.py | 3 +- python/tribune_tts_pb2.py | 31 +++++--- python/wave_saver.py | 6 +- 10 files changed, 153 insertions(+), 38 deletions(-) diff --git a/cpp/libtribune-client/tribune_client.cpp b/cpp/libtribune-client/tribune_client.cpp index f1f0793..125c9ad 100644 --- a/cpp/libtribune-client/tribune_client.cpp +++ b/cpp/libtribune-client/tribune_client.cpp @@ -8,9 +8,16 @@ namespace techmo { namespace tribune { SynthesizeRequest build_request(const TribuneClientConfig& config, const std::string& text) { + if(config.use_opus and (config.sample_rate_hertz != 0 and config.sample_rate_hertz != 8000 + and config.sample_rate_hertz != 12000 and config.sample_rate_hertz != 16000 + and config.sample_rate_hertz != 24000 and config.sample_rate_hertz != 48000)){ + throw std::runtime_error("Only valid sample rates with Opus encoding are: 8000, 12000, 16000, 24000, 48000."); + } + SynthesizeRequest request; request.set_text(text); request.mutable_config()->set_sample_rate_hertz(config.sample_rate_hertz); + request.mutable_config()->set_use_opus(config.use_opus); return request; } diff --git a/cpp/libtribune-client/tribune_client.h b/cpp/libtribune-client/tribune_client.h index 72dfe0a..9a9e531 100644 --- a/cpp/libtribune-client/tribune_client.h +++ b/cpp/libtribune-client/tribune_client.h @@ -9,6 +9,7 @@ struct TribuneClientConfig { // Session ID is the best way to match log's from client application with these on server side. int grpc_timeout = 0; // Timeout in milliseconds used to set gRPC deadline - how long the client is willing to wait for a reply from the server. unsigned int sample_rate_hertz = 0; // Sample rate in Hz of synthesized audio. If set to 0, the service will use voice's original sample rate. + bool use_opus = false; }; struct TribuneAudioData { diff --git a/cpp/libtribune-client/tribune_tts.pb.cc b/cpp/libtribune-client/tribune_tts.pb.cc index 591d5dc..c2397dd 100644 --- a/cpp/libtribune-client/tribune_tts.pb.cc +++ b/cpp/libtribune-client/tribune_tts.pb.cc @@ -87,6 +87,7 @@ const ::google::protobuf::uint32 TableStruct::offsets[] GOOGLE_ATTRIBUTE_SECTION ~0u, // no _oneof_case_ ~0u, // no _weak_field_map_ GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SynthesizeConfig, sample_rate_hertz_), + GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SynthesizeConfig, use_opus_), ~0u, // no _has_bits_ GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET(SynthesizeResponse, _internal_metadata_), ~0u, // no _extensions_ @@ -113,9 +114,9 @@ const ::google::protobuf::uint32 TableStruct::offsets[] GOOGLE_ATTRIBUTE_SECTION static const ::google::protobuf::internal::MigrationSchema schemas[] GOOGLE_ATTRIBUTE_SECTION_VARIABLE(protodesc_cold) = { { 0, -1, sizeof(SynthesizeRequest)}, { 7, -1, sizeof(SynthesizeConfig)}, - { 13, -1, sizeof(SynthesizeResponse)}, - { 20, -1, sizeof(AudioData)}, - { 28, -1, sizeof(Error)}, + { 14, -1, sizeof(SynthesizeResponse)}, + { 21, -1, sizeof(AudioData)}, + { 29, -1, sizeof(Error)}, }; static ::google::protobuf::Message const * const file_default_instances[] = { @@ -181,22 +182,22 @@ void AddDescriptorsImpl() { "\n\021tribune_tts.proto\022\016techmo.tribune\"S\n\021S" "ynthesizeRequest\022\014\n\004text\030\001 \001(\t\0220\n\006config" "\030\002 \001(\0132 .techmo.tribune.SynthesizeConfig" - "\"-\n\020SynthesizeConfig\022\031\n\021sample_rate_hert" - "z\030\001 \001(\005\"d\n\022SynthesizeResponse\022(\n\005audio\030\001" - " \001(\0132\031.techmo.tribune.AudioData\022$\n\005error" - "\030\002 \001(\0132\025.techmo.tribune.Error\"N\n\tAudioDa" - "ta\022\031\n\021sample_rate_hertz\030\001 \001(\005\022\017\n\007content" - "\030\002 \001(\014\022\025\n\rend_of_stream\030\003 \001(\010\"E\n\005Error\022\'" - "\n\004code\030\001 \001(\0162\031.techmo.tribune.ErrorCode\022" - "\023\n\013description\030\002 \001(\t*_\n\tErrorCode\022\013\n\007UNK" - "NOWN\020\000\022\013\n\007LICENCE\020\001\022\026\n\022TEXT_NORMALIZATIO" - "N\020\002\022\021\n\rTRANSCRIPTION\020\003\022\r\n\tSYNTHESIS\020\0042\\\n" - "\003TTS\022U\n\nSynthesize\022!.techmo.tribune.Synt" - "hesizeRequest\032\".techmo.tribune.Synthesiz" - "eResponse0\001b\006proto3" + "\"\?\n\020SynthesizeConfig\022\031\n\021sample_rate_hert" + "z\030\001 \001(\005\022\020\n\010use_opus\030\002 \001(\010\"d\n\022SynthesizeR" + "esponse\022(\n\005audio\030\001 \001(\0132\031.techmo.tribune." + "AudioData\022$\n\005error\030\002 \001(\0132\025.techmo.tribun" + "e.Error\"N\n\tAudioData\022\031\n\021sample_rate_hert" + "z\030\001 \001(\005\022\017\n\007content\030\002 \001(\014\022\025\n\rend_of_strea" + "m\030\003 \001(\010\"E\n\005Error\022\'\n\004code\030\001 \001(\0162\031.techmo." + "tribune.ErrorCode\022\023\n\013description\030\002 \001(\t*_" + "\n\tErrorCode\022\013\n\007UNKNOWN\020\000\022\013\n\007LICENCE\020\001\022\026\n" + "\022TEXT_NORMALIZATION\020\002\022\021\n\rTRANSCRIPTION\020\003" + "\022\r\n\tSYNTHESIS\020\0042\\\n\003TTS\022U\n\nSynthesize\022!.t" + "echmo.tribune.SynthesizeRequest\032\".techmo" + ".tribune.SynthesizeResponse0\001b\006proto3" }; ::google::protobuf::DescriptorPool::InternalAddGeneratedFile( - descriptor, 619); + descriptor, 637); ::google::protobuf::MessageFactory::InternalRegisterGeneratedFile( "tribune_tts.proto", &protobuf_RegisterTypes); } @@ -636,6 +637,7 @@ void SynthesizeRequest::set_allocated_config(::techmo::tribune::SynthesizeConfig #if !defined(_MSC_VER) || _MSC_VER >= 1900 const int SynthesizeConfig::kSampleRateHertzFieldNumber; +const int SynthesizeConfig::kUseOpusFieldNumber; #endif // !defined(_MSC_VER) || _MSC_VER >= 1900 SynthesizeConfig::SynthesizeConfig() @@ -651,12 +653,16 @@ SynthesizeConfig::SynthesizeConfig(const SynthesizeConfig& from) _internal_metadata_(NULL), _cached_size_(0) { _internal_metadata_.MergeFrom(from._internal_metadata_); - sample_rate_hertz_ = from.sample_rate_hertz_; + ::memcpy(&sample_rate_hertz_, &from.sample_rate_hertz_, + static_cast(reinterpret_cast(&use_opus_) - + reinterpret_cast(&sample_rate_hertz_)) + sizeof(use_opus_)); // @@protoc_insertion_point(copy_constructor:techmo.tribune.SynthesizeConfig) } void SynthesizeConfig::SharedCtor() { - sample_rate_hertz_ = 0; + ::memset(&sample_rate_hertz_, 0, static_cast( + reinterpret_cast(&use_opus_) - + reinterpret_cast(&sample_rate_hertz_)) + sizeof(use_opus_)); _cached_size_ = 0; } @@ -697,7 +703,9 @@ void SynthesizeConfig::Clear() { // Prevent compiler warnings about cached_has_bits being unused (void) cached_has_bits; - sample_rate_hertz_ = 0; + ::memset(&sample_rate_hertz_, 0, static_cast( + reinterpret_cast(&use_opus_) - + reinterpret_cast(&sample_rate_hertz_)) + sizeof(use_opus_)); _internal_metadata_.Clear(); } @@ -725,6 +733,20 @@ bool SynthesizeConfig::MergePartialFromCodedStream( break; } + // bool use_opus = 2; + case 2: { + if (static_cast< ::google::protobuf::uint8>(tag) == + static_cast< ::google::protobuf::uint8>(16u /* 16 & 0xFF */)) { + + DO_((::google::protobuf::internal::WireFormatLite::ReadPrimitive< + bool, ::google::protobuf::internal::WireFormatLite::TYPE_BOOL>( + input, &use_opus_))); + } else { + goto handle_unusual; + } + break; + } + default: { handle_unusual: if (tag == 0) { @@ -756,6 +778,11 @@ void SynthesizeConfig::SerializeWithCachedSizes( ::google::protobuf::internal::WireFormatLite::WriteInt32(1, this->sample_rate_hertz(), output); } + // bool use_opus = 2; + if (this->use_opus() != 0) { + ::google::protobuf::internal::WireFormatLite::WriteBool(2, this->use_opus(), output); + } + if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) { ::google::protobuf::internal::WireFormat::SerializeUnknownFields( (::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), output); @@ -775,6 +802,11 @@ ::google::protobuf::uint8* SynthesizeConfig::InternalSerializeWithCachedSizesToA target = ::google::protobuf::internal::WireFormatLite::WriteInt32ToArray(1, this->sample_rate_hertz(), target); } + // bool use_opus = 2; + if (this->use_opus() != 0) { + target = ::google::protobuf::internal::WireFormatLite::WriteBoolToArray(2, this->use_opus(), target); + } + if ((_internal_metadata_.have_unknown_fields() && ::google::protobuf::internal::GetProto3PreserveUnknownsDefault())) { target = ::google::protobuf::internal::WireFormat::SerializeUnknownFieldsToArray( (::google::protobuf::internal::GetProto3PreserveUnknownsDefault() ? _internal_metadata_.unknown_fields() : _internal_metadata_.default_instance()), target); @@ -799,6 +831,11 @@ size_t SynthesizeConfig::ByteSizeLong() const { this->sample_rate_hertz()); } + // bool use_opus = 2; + if (this->use_opus() != 0) { + total_size += 1 + 1; + } + int cached_size = ::google::protobuf::internal::ToCachedSize(total_size); GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN(); _cached_size_ = cached_size; @@ -831,6 +868,9 @@ void SynthesizeConfig::MergeFrom(const SynthesizeConfig& from) { if (from.sample_rate_hertz() != 0) { set_sample_rate_hertz(from.sample_rate_hertz()); } + if (from.use_opus() != 0) { + set_use_opus(from.use_opus()); + } } void SynthesizeConfig::CopyFrom(const ::google::protobuf::Message& from) { @@ -858,6 +898,7 @@ void SynthesizeConfig::Swap(SynthesizeConfig* other) { void SynthesizeConfig::InternalSwap(SynthesizeConfig* other) { using std::swap; swap(sample_rate_hertz_, other->sample_rate_hertz_); + swap(use_opus_, other->use_opus_); _internal_metadata_.Swap(&other->_internal_metadata_); swap(_cached_size_, other->_cached_size_); } @@ -884,6 +925,20 @@ void SynthesizeConfig::set_sample_rate_hertz(::google::protobuf::int32 value) { // @@protoc_insertion_point(field_set:techmo.tribune.SynthesizeConfig.sample_rate_hertz) } +// bool use_opus = 2; +void SynthesizeConfig::clear_use_opus() { + use_opus_ = false; +} +bool SynthesizeConfig::use_opus() const { + // @@protoc_insertion_point(field_get:techmo.tribune.SynthesizeConfig.use_opus) + return use_opus_; +} +void SynthesizeConfig::set_use_opus(bool value) { + + use_opus_ = value; + // @@protoc_insertion_point(field_set:techmo.tribune.SynthesizeConfig.use_opus) +} + #endif // PROTOBUF_INLINE_NOT_IN_HEADERS // =================================================================== diff --git a/cpp/libtribune-client/tribune_tts.pb.h b/cpp/libtribune-client/tribune_tts.pb.h index 2d6a509..02d4359 100644 --- a/cpp/libtribune-client/tribune_tts.pb.h +++ b/cpp/libtribune-client/tribune_tts.pb.h @@ -297,11 +297,18 @@ class SynthesizeConfig : public ::google::protobuf::Message /* @@protoc_insertio ::google::protobuf::int32 sample_rate_hertz() const; void set_sample_rate_hertz(::google::protobuf::int32 value); + // bool use_opus = 2; + void clear_use_opus(); + static const int kUseOpusFieldNumber = 2; + bool use_opus() const; + void set_use_opus(bool value); + // @@protoc_insertion_point(class_scope:techmo.tribune.SynthesizeConfig) private: ::google::protobuf::internal::InternalMetadataWithArena _internal_metadata_; ::google::protobuf::int32 sample_rate_hertz_; + bool use_opus_; mutable int _cached_size_; friend struct protobuf_tribune_5ftts_2eproto::TableStruct; }; @@ -769,6 +776,20 @@ inline void SynthesizeConfig::set_sample_rate_hertz(::google::protobuf::int32 va // @@protoc_insertion_point(field_set:techmo.tribune.SynthesizeConfig.sample_rate_hertz) } +// bool use_opus = 2; +inline void SynthesizeConfig::clear_use_opus() { + use_opus_ = false; +} +inline bool SynthesizeConfig::use_opus() const { + // @@protoc_insertion_point(field_get:techmo.tribune.SynthesizeConfig.use_opus) + return use_opus_; +} +inline void SynthesizeConfig::set_use_opus(bool value) { + + use_opus_ = value; + // @@protoc_insertion_point(field_set:techmo.tribune.SynthesizeConfig.use_opus) +} + // ------------------------------------------------------------------- // SynthesizeResponse diff --git a/cpp/tribune-client/main.cpp b/cpp/tribune-client/main.cpp index f1d26e6..498a66b 100644 --- a/cpp/tribune-client/main.cpp +++ b/cpp/tribune-client/main.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -26,7 +27,9 @@ po::options_description CreateOptionsDescription(void) { "how long the client is willing to wait for a reply from the server. " "If not specified, the service will set the deadline to a very large number.") ("sample-rate-hertz", po::value()->default_value(0), - "Sample rate in Hz of synthesized audio. Set to 0 (default) to use voice's original sample rate."); + "Sample rate in Hz of synthesized audio. Set to 0 (default) to use voice's original sample rate.") + ("use-opus", + "Flag to compress audio using Opus codec, default: false"); return optionsDescription; } @@ -58,12 +61,20 @@ int main(int argc, const char *const argv[]) { config.session_id = userOptions["session-id"].as(); config.grpc_timeout = userOptions["grpc-timeout"].as(); config.sample_rate_hertz = sample_rate_hertz; + config.use_opus = userOptions.count("use-opus"); techmo::tribune::TribuneClient tribune_client{ userOptions["service-address"].as() }; const auto audio_data = tribune_client.Synthesize(config, userOptions["text"].as()); - WriteWaveFile(userOptions["out-path"].as(), audio_data.sample_rate_hertz, audio_data.audio_bytes); + if(config.use_opus == false) { + WriteWaveFile(userOptions["out-path"].as(), audio_data.sample_rate_hertz, audio_data.audio_bytes); + } + else { + std::fstream file(userOptions["out-path"].as(), std::ios::binary | std::ios::trunc | std::ios::out); + file.write(audio_data.audio_bytes.data(), audio_data.audio_bytes.size()); + file.flush(); + } } catch (const std::exception &e) { std::cerr << e.what() << std::endl; diff --git a/proto/tribune_tts.proto b/proto/tribune_tts.proto index 0388efe..3c0e111 100755 --- a/proto/tribune_tts.proto +++ b/proto/tribune_tts.proto @@ -50,6 +50,9 @@ message SynthesizeConfig { // Desired sampling frequency in hertz of synthesized audio. int32 sample_rate_hertz = 1; + + // Use OggOpus compression and save to `.opus` file + bool use_opus = 2; } // `SynthesizeResponse` is the only message returned to the client by diff --git a/python/call_synthesize.py b/python/call_synthesize.py index 11fc813..840ff39 100644 --- a/python/call_synthesize.py +++ b/python/call_synthesize.py @@ -14,7 +14,9 @@ def call_synthesize(args, text): stub = tribune_tts_pb2_grpc.TTSStub(channel) # Synthesis request - config = tribune_tts_pb2.SynthesizeConfig(sample_rate_hertz=int(args.sample_rate)) + config = tribune_tts_pb2.SynthesizeConfig(sample_rate_hertz=int(args.sample_rate), use_opus=bool(args.use_opus)) + if config.use_opus is True and config.sample_rate_hertz not in [0, 8000, 12000, 16000, 24000, 48000]: + raise RuntimeError("Only valid sample rates with opus encoding are: 8000, 12000, 16000, 24000, 48000.") request = tribune_tts_pb2.SynthesizeRequest(text=text, config=config) ws = WaveSaver() @@ -38,7 +40,10 @@ def call_synthesize(args, text): ws.setFrameRate(response.audio.sample_rate_hertz) ws.append(response.audio.content) if response.audio.end_of_stream: - ws.save(wavefilename) + if config.use_opus is False: + ws.save(wavefilename) + else: + ws.save_raw(wavefilename) except grpc.RpcError as e: print("[Server-side error] Received following RPC error from the TTS service:", str(e)) ws.clear() diff --git a/python/tribune_client.py b/python/tribune_client.py index 43cfc0c..6fc1dd2 100755 --- a/python/tribune_client.py +++ b/python/tribune_client.py @@ -22,7 +22,8 @@ def main(): help="Path to output wave file with synthesized audio content.", type=str) parser.add_argument("-f", "--sample_rate", dest="sample_rate", default=0, help="Sample rate in Hz of synthesized audio. Set to 0 (default) to use voice's original sample rate.", type=int) - + parser.add_argument("--use-opus", action='store_true', + help="Flag to compress audio using Opus codec, default: false") # Parse and validate options args = parser.parse_args() diff --git a/python/tribune_tts_pb2.py b/python/tribune_tts_pb2.py index f89dea2..ad6857a 100644 --- a/python/tribune_tts_pb2.py +++ b/python/tribune_tts_pb2.py @@ -20,7 +20,7 @@ name='tribune_tts.proto', package='techmo.tribune', syntax='proto3', - serialized_pb=_b('\n\x11tribune_tts.proto\x12\x0etechmo.tribune\"S\n\x11SynthesizeRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x30\n\x06\x63onfig\x18\x02 \x01(\x0b\x32 .techmo.tribune.SynthesizeConfig\"-\n\x10SynthesizeConfig\x12\x19\n\x11sample_rate_hertz\x18\x01 \x01(\x05\"d\n\x12SynthesizeResponse\x12(\n\x05\x61udio\x18\x01 \x01(\x0b\x32\x19.techmo.tribune.AudioData\x12$\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x15.techmo.tribune.Error\"N\n\tAudioData\x12\x19\n\x11sample_rate_hertz\x18\x01 \x01(\x05\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\x0c\x12\x15\n\rend_of_stream\x18\x03 \x01(\x08\"E\n\x05\x45rror\x12\'\n\x04\x63ode\x18\x01 \x01(\x0e\x32\x19.techmo.tribune.ErrorCode\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t*_\n\tErrorCode\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x0b\n\x07LICENCE\x10\x01\x12\x16\n\x12TEXT_NORMALIZATION\x10\x02\x12\x11\n\rTRANSCRIPTION\x10\x03\x12\r\n\tSYNTHESIS\x10\x04\x32\\\n\x03TTS\x12U\n\nSynthesize\x12!.techmo.tribune.SynthesizeRequest\x1a\".techmo.tribune.SynthesizeResponse0\x01\x62\x06proto3') + serialized_pb=_b('\n\x11tribune_tts.proto\x12\x0etechmo.tribune\"S\n\x11SynthesizeRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x30\n\x06\x63onfig\x18\x02 \x01(\x0b\x32 .techmo.tribune.SynthesizeConfig\"?\n\x10SynthesizeConfig\x12\x19\n\x11sample_rate_hertz\x18\x01 \x01(\x05\x12\x10\n\x08use_opus\x18\x02 \x01(\x08\"d\n\x12SynthesizeResponse\x12(\n\x05\x61udio\x18\x01 \x01(\x0b\x32\x19.techmo.tribune.AudioData\x12$\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x15.techmo.tribune.Error\"N\n\tAudioData\x12\x19\n\x11sample_rate_hertz\x18\x01 \x01(\x05\x12\x0f\n\x07\x63ontent\x18\x02 \x01(\x0c\x12\x15\n\rend_of_stream\x18\x03 \x01(\x08\"E\n\x05\x45rror\x12\'\n\x04\x63ode\x18\x01 \x01(\x0e\x32\x19.techmo.tribune.ErrorCode\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t*_\n\tErrorCode\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x0b\n\x07LICENCE\x10\x01\x12\x16\n\x12TEXT_NORMALIZATION\x10\x02\x12\x11\n\rTRANSCRIPTION\x10\x03\x12\r\n\tSYNTHESIS\x10\x04\x32\\\n\x03TTS\x12U\n\nSynthesize\x12!.techmo.tribune.SynthesizeRequest\x1a\".techmo.tribune.SynthesizeResponse0\x01\x62\x06proto3') ) _ERRORCODE = _descriptor.EnumDescriptor( @@ -52,8 +52,8 @@ ], containing_type=None, options=None, - serialized_start=422, - serialized_end=517, + serialized_start=440, + serialized_end=535, ) _sym_db.RegisterEnumDescriptor(_ERRORCODE) @@ -118,6 +118,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None), + _descriptor.FieldDescriptor( + name='use_opus', full_name='techmo.tribune.SynthesizeConfig.use_opus', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), ], extensions=[ ], @@ -131,7 +138,7 @@ oneofs=[ ], serialized_start=122, - serialized_end=167, + serialized_end=185, ) @@ -168,8 +175,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=169, - serialized_end=269, + serialized_start=187, + serialized_end=287, ) @@ -213,8 +220,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=271, - serialized_end=349, + serialized_start=289, + serialized_end=367, ) @@ -251,8 +258,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=351, - serialized_end=420, + serialized_start=369, + serialized_end=438, ) _SYNTHESIZEREQUEST.fields_by_name['config'].message_type = _SYNTHESIZECONFIG @@ -310,8 +317,8 @@ file=DESCRIPTOR, index=0, options=None, - serialized_start=519, - serialized_end=611, + serialized_start=537, + serialized_end=629, methods=[ _descriptor.MethodDescriptor( name='Synthesize', diff --git a/python/wave_saver.py b/python/wave_saver.py index 7f47b1a..25714c9 100755 --- a/python/wave_saver.py +++ b/python/wave_saver.py @@ -28,7 +28,11 @@ def save(self, filename): params = (self._nchannels, self._sampwidth, self._framerate, len(self.buffer), 'NONE', 'not compressed') w.setparams(params) w.writeframes(self.buffer) - + + def save_raw(self, filename): + with open(filename, 'w+b') as f: + f.write(self.buffer) + def load(self, filename): with wave.open(filename, 'r') as wr: self.buffer = wr.readframes(wr.getnframes()) From 80164c91d4e6f9d6f4c58c9904084b56f8d00465 Mon Sep 17 00:00:00 2001 From: jskrzypek Date: Tue, 25 Jun 2019 17:14:21 +0200 Subject: [PATCH 2/6] wip --- README.md | 13 +-------- cpp/CHANGELOG.md | 4 +++ cpp/libtribune-client/VERSION.h | 2 +- cpp/libtribune-client/tribune_client.cpp | 6 ++-- cpp/libtribune-client/tribune_client.h | 6 +++- cpp/tribune-client/main.cpp | 17 +++++++++--- proto/tribune_tts.proto | 26 ++++++++---------- python/CHANGELOG.md | 4 +++ python/VERSION.py | 2 +- python/call_synthesize.py | 15 ++++------ python/saver_factory.py | 35 ++++++++++++++++++++++++ python/tribune_client.py | 5 ++-- 12 files changed, 86 insertions(+), 49 deletions(-) create mode 100644 python/saver_factory.py diff --git a/README.md b/README.md index 6409c03..ec597af 100644 --- a/README.md +++ b/README.md @@ -15,18 +15,7 @@ Language-specific build instructions can be found in their respective directorie Techmo TTS Service API is defined in `proto/TTS.proto` file. Service's `Synthesize` method accepts `SynthesizeRequest` object which contains whole phrase to be synthesized. -You have to put the phrase as a string in `text` field of `SynthesizeRequest`. The string has to be in orthographic form. In that string you can use several special tags which can be interpreted. Tags have to be in from `something special` and can occur in any place in text. Currently interpreted tags are: - -cardinal cardinal number "7" -> "siedem" -signed number with sign "-15" -> "minus piętnaście" -ordinal ordinal number "1" -> "pierwszy" -fraction fractional number "3/4" -> "trzy czwarte" -postal postal code "30-020" -> "trzydzieści zero dwadzieścia" -time time "" -> "dwudziesta druga" -date date "12/05/2001" -> "dwunasty maja dwa tysiące jeden" - -Note: when interpreting tags only nominal case is supported at the moment. - +You have to put the phrase as a string in `text` field of `SynthesizeRequest`. The string has to be in orthographic form. You can set `SynthesizeConfig`'s fields to specify parameters of synthesis. Currently supported option is only `sample_rate_hertz`, which is desired sampling frequency (in hertz) of synthesized audio. `SynthesizeRequest` can be sent to the service via gRPC insecure channel (that does not require authentication). diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 33273cc..0f54eff 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -1,5 +1,9 @@ # Tribune TTS gRPC C++ client Changelog +## [1.3.0] - 2019-06-24 +### Added +- `audio-encoding` option. + ## [1.2.0] - 2018-12-12 ### Added - Support for setting gRPC deadline (how long the client is willing to wait for a reply from the server). diff --git a/cpp/libtribune-client/VERSION.h b/cpp/libtribune-client/VERSION.h index 7da2f5f..4b22c6d 100644 --- a/cpp/libtribune-client/VERSION.h +++ b/cpp/libtribune-client/VERSION.h @@ -1 +1 @@ -static constexpr auto LIBTRIBUNE_CLIENT_VERSION = "1.2.0"; +static constexpr auto LIBTRIBUNE_CLIENT_VERSION = "1.3.0"; diff --git a/cpp/libtribune-client/tribune_client.cpp b/cpp/libtribune-client/tribune_client.cpp index 125c9ad..3fb350c 100644 --- a/cpp/libtribune-client/tribune_client.cpp +++ b/cpp/libtribune-client/tribune_client.cpp @@ -8,10 +8,8 @@ namespace techmo { namespace tribune { SynthesizeRequest build_request(const TribuneClientConfig& config, const std::string& text) { - if(config.use_opus and (config.sample_rate_hertz != 0 and config.sample_rate_hertz != 8000 - and config.sample_rate_hertz != 12000 and config.sample_rate_hertz != 16000 - and config.sample_rate_hertz != 24000 and config.sample_rate_hertz != 48000)){ - throw std::runtime_error("Only valid sample rates with Opus encoding are: 8000, 12000, 16000, 24000, 48000."); + if(config.encoding == AudioEncoding::OGG_OPUS and config.sample_rate_hertz != 0 ){ + throw std::runtime_error("Custom sample rate is not supported with Opus compression."); } SynthesizeRequest request; diff --git a/cpp/libtribune-client/tribune_client.h b/cpp/libtribune-client/tribune_client.h index 9a9e531..5ac6019 100644 --- a/cpp/libtribune-client/tribune_client.h +++ b/cpp/libtribune-client/tribune_client.h @@ -4,12 +4,16 @@ namespace techmo { namespace tribune { +enum AudioEncoding{ + LINEAR16, OGG_OPUS +}; + struct TribuneClientConfig { std::string session_id = ""; // Session ID to be passed to the service. If not specified, the service will generate a default session ID itself. // Session ID is the best way to match log's from client application with these on server side. int grpc_timeout = 0; // Timeout in milliseconds used to set gRPC deadline - how long the client is willing to wait for a reply from the server. unsigned int sample_rate_hertz = 0; // Sample rate in Hz of synthesized audio. If set to 0, the service will use voice's original sample rate. - bool use_opus = false; + AudioEncoding encoding = AudioEncoding::OGG_OPUS; }; struct TribuneAudioData { diff --git a/cpp/tribune-client/main.cpp b/cpp/tribune-client/main.cpp index 498a66b..d06b133 100644 --- a/cpp/tribune-client/main.cpp +++ b/cpp/tribune-client/main.cpp @@ -28,8 +28,8 @@ po::options_description CreateOptionsDescription(void) { "If not specified, the service will set the deadline to a very large number.") ("sample-rate-hertz", po::value()->default_value(0), "Sample rate in Hz of synthesized audio. Set to 0 (default) to use voice's original sample rate.") - ("use-opus", - "Flag to compress audio using Opus codec, default: false"); + ("audio-encoding", po::value()->default_value("LINEAR16"), + "Audio encoding, possible values are: LINEAR16, OGG_OPUS"); return optionsDescription; } @@ -61,13 +61,22 @@ int main(int argc, const char *const argv[]) { config.session_id = userOptions["session-id"].as(); config.grpc_timeout = userOptions["grpc-timeout"].as(); config.sample_rate_hertz = sample_rate_hertz; - config.use_opus = userOptions.count("use-opus"); + const auto encoding = userOptions["audio-encoding"].as(); + if(encoding == "LINEAR16") { + config.encoding = techmo::tribune::AudioEncoding::LINEAR16; + } + else if(encoding == "OGG_OPUS") { + config.encoding = techmo::tribune::AudioEncoding::OGG_OPUS; + } + else { + throw std::runtime_error("Unknown audio encoding: " + encoding); + } techmo::tribune::TribuneClient tribune_client{ userOptions["service-address"].as() }; const auto audio_data = tribune_client.Synthesize(config, userOptions["text"].as()); - if(config.use_opus == false) { + if(config.encoding == techmo::tribune::AudioEncoding::LINEAR16) { WriteWaveFile(userOptions["out-path"].as(), audio_data.sample_rate_hertz, audio_data.audio_bytes); } else { diff --git a/proto/tribune_tts.proto b/proto/tribune_tts.proto index 3c0e111..9540c61 100755 --- a/proto/tribune_tts.proto +++ b/proto/tribune_tts.proto @@ -9,19 +9,7 @@ package techmo.tribune; // Service that implements Techmo Text-To-Speech (TTS) API. // // Service's `Synthesize` method accepts `SynthesizeRequest` object which contains whole phrase to be synthesized. -// You have to put the phrase as a string in `text` field of `SynthesizeRequest`. The string has to be in orthographic form. In that string you can use several special tags which can be interpreted. Tags have to be in from `something special` and can occur in any place in text. Currently interpreted tags are: -// -// | Tag | Description | Example (input) | Example (output) | -// | --- | ----------- | --------------- | ---------------- | -// | **cardinal** | *cardinal number* | `7` | *siedem* | -// | **signed** | *number with sign* | `-15` | *minus piętnaście* | -// | **ordinal** | *ordinal number* | `1` | *pierwszy* | -// | **fraction** | *fractional number* | `3/4` | *trzy czwarte* | -// | **postal** | *postal code* | `30-020` | *trzydzieści zero dwadzieścia* | -// | **time** | *time* | `` | *dwudziesta druga* | -// | **date** | *date* | `12/05/2001` | *dwunasty maja dwa tysiące jeden* | -// -// Note: when interpreting tags only nominal case is supported at the moment. +// You have to put the phrase as a string in `text` field of `SynthesizeRequest`. The string has to be in orthographic form. // // You can set `SynthesizeConfig`'s fields to specify parameters of synthesis. Currently supported option is only `sample_rate_hertz`, which is desired sampling frequency (in hertz) of synthesized audio. // @@ -45,14 +33,22 @@ message SynthesizeRequest SynthesizeConfig config = 2; } +enum AudioEncoding { + // Signed 16 bit little endian PCM. + LINEAR16 = 0; + // Opus compressed audio in Ogg file container. + OGG_OPUS = 1; + } + // Provides information to the synthesizer that specifies how to process the request. message SynthesizeConfig { // Desired sampling frequency in hertz of synthesized audio. int32 sample_rate_hertz = 1; - // Use OggOpus compression and save to `.opus` file - bool use_opus = 2; + // Audio Format. + // Using OGG_OPUS will result in sample_rate_hertz being ignored. + AudioEncoding encoding = 2; } // `SynthesizeResponse` is the only message returned to the client by diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 798a485..24af7e1 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -1,5 +1,9 @@ # Tribune TTS gRPC Python client Changelog +## [1.3.0] - 2019-06-24 +### Added +- `audio-encoding` option. + ## [1.2.0] - 2018-12-12 ### Added - Support for setting gRPC deadline (how long the client is willing to wait for a reply from the server). diff --git a/python/VERSION.py b/python/VERSION.py index f330d79..3258c7c 100644 --- a/python/VERSION.py +++ b/python/VERSION.py @@ -1 +1 @@ -TRIBUNE_CLIENT_VERSION = '1.2.0' +TRIBUNE_CLIENT_VERSION = '1.3.0' diff --git a/python/call_synthesize.py b/python/call_synthesize.py index 840ff39..f065b37 100644 --- a/python/call_synthesize.py +++ b/python/call_synthesize.py @@ -2,7 +2,7 @@ import tribune_tts_pb2_grpc import grpc import os -from wave_saver import WaveSaver +from saver_factory import SaverFactory def call_synthesize(args, text): @@ -14,11 +14,11 @@ def call_synthesize(args, text): stub = tribune_tts_pb2_grpc.TTSStub(channel) # Synthesis request - config = tribune_tts_pb2.SynthesizeConfig(sample_rate_hertz=int(args.sample_rate), use_opus=bool(args.use_opus)) - if config.use_opus is True and config.sample_rate_hertz not in [0, 8000, 12000, 16000, 24000, 48000]: - raise RuntimeError("Only valid sample rates with opus encoding are: 8000, 12000, 16000, 24000, 48000.") + config = tribune_tts_pb2.SynthesizeConfig(sample_rate_hertz=int(args.sample_rate), encoding=args.audio_encoding) + if config.encoding is tribune_tts_pb2.AudioEncoding.Value('OGG_OPUS') and config.sample_rate_hertz is not 0: + raise RuntimeError("Custom sample rate is not supported with Opus compression.") request = tribune_tts_pb2.SynthesizeRequest(text=text, config=config) - ws = WaveSaver() + ws = SaverFactory.get_saver(encoding=config.encoding) timeout=None if args.grpc_timeout > 0: @@ -40,10 +40,7 @@ def call_synthesize(args, text): ws.setFrameRate(response.audio.sample_rate_hertz) ws.append(response.audio.content) if response.audio.end_of_stream: - if config.use_opus is False: - ws.save(wavefilename) - else: - ws.save_raw(wavefilename) + ws.save(wavefilename) except grpc.RpcError as e: print("[Server-side error] Received following RPC error from the TTS service:", str(e)) ws.clear() diff --git a/python/saver_factory.py b/python/saver_factory.py new file mode 100644 index 0000000..4aa98bd --- /dev/null +++ b/python/saver_factory.py @@ -0,0 +1,35 @@ +import tribune_tts_pb2 +import tribune_tts_pb2_grpc +import grpc +import os +from wave_saver import WaveSaver + +class SimpleSaver: + """Raw saver for TTS""" + buffer = None + _framerate = None + + def __init__(self, sampling_frequency = None): + self.buffer = bytearray() + _framerate = None + + def setFrameRate(self, sampling_frequency): + self._framerate = sampling_frequency + + def append(self, audiodata): + self.buffer += audiodata + + def clear(self): + self.buffer.clear() + + def save(self, filename): + with open(filename, 'w+b') as f: + f.write(self.buffer) + +class SaverFactory: + @staticmethod + def get_saver(encoding): + if encoding == tribune_tts_pb2.AudioEncoding.Value('OGG_OPUS'): + return SimpleSaver() + else: + return WaveSaver() diff --git a/python/tribune_client.py b/python/tribune_client.py index 6fc1dd2..003ad0a 100755 --- a/python/tribune_client.py +++ b/python/tribune_client.py @@ -22,8 +22,9 @@ def main(): help="Path to output wave file with synthesized audio content.", type=str) parser.add_argument("-f", "--sample_rate", dest="sample_rate", default=0, help="Sample rate in Hz of synthesized audio. Set to 0 (default) to use voice's original sample rate.", type=int) - parser.add_argument("--use-opus", action='store_true', - help="Flag to compress audio using Opus codec, default: false") + parser.add_argument("--audio-encoding", dest="audio_encoding", default="LINEAR16", + help="Audio encoding, possible values are: LINEAR16, OGG_OPUS", type=str) + # Parse and validate options args = parser.parse_args() From e291c167644126748190638319dea730f2fa44be Mon Sep 17 00:00:00 2001 From: jskrzypek Date: Tue, 25 Jun 2019 18:24:15 +0200 Subject: [PATCH 3/6] proto --- proto/tribune_tts.proto | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/proto/tribune_tts.proto b/proto/tribune_tts.proto index 9540c61..e2926ab 100755 --- a/proto/tribune_tts.proto +++ b/proto/tribune_tts.proto @@ -1,21 +1,29 @@ // Techmo Tribune TTS API -// version: 1.1.0 -// authors: Dawid Skurzok, Paweł Jaciów -// date: 2018-01-25 +// version: 1.2.0 +// authors: Dawid Skurzok, Paweł Jaciów, Jerzy Skrzypek +// date: 2019-06-25 syntax = "proto3"; package techmo.tribune; -// Service that implements Techmo Text-To-Speech (TTS) API. +// Techmo Text-To-Speech (TTS) API. // // Service's `Synthesize` method accepts `SynthesizeRequest` object which contains whole phrase to be synthesized. // You have to put the phrase as a string in `text` field of `SynthesizeRequest`. The string has to be in orthographic form. // -// You can set `SynthesizeConfig`'s fields to specify parameters of synthesis. Currently supported option is only `sample_rate_hertz`, which is desired sampling frequency (in hertz) of synthesized audio. +// You can set `SynthesizeConfig`'s fields to specify parameters of synthesis. +// Service currently supports audio in two formats: sign 16 bit little-endian PCM wave or Ogg Opus. +// `sample_rate_hertz` can be set to change sampling rate of synthesized audio for wave PCM only. // // `SynthesizeRequest` can be sent to the service via gRPC insecure channel (that does not require authentication). // -// `Synthesize` returns synthesized audio in `SynthesizeResponse` as a stream. When reading from the stream you have to check if `SynthesizeResponse` contains `error` field. If it does you can print its `code` and `description`. No `error` field in `SynthesizeResponse` means everything worked fine and its `audio` contains byte `content` that can be appended to received audio samples with `sample_rate_hertz` sampling frequency in hertz. When receiving `SynthesizeResponse` with `audio` you have to check if its `end_of_stream` flag is set to true. When it is set to true it means service has fnished synthesis and you can save your wave file with received synthesized audio content. +// `Synthesize` returns synthesized audio in `SynthesizeResponse` as a stream. +// When reading from the stream you have to check if `SynthesizeResponse` contains `error` field. +// If it does you can print its `code` and `description`. +// No `error` field in `SynthesizeResponse` means everything worked fine and its `audio` contains byte `content` +// that can be appended to received audio samples with `sample_rate_hertz` sampling frequency in hertz. +// When receiving `SynthesizeResponse` with `audio` you have to check if its `end_of_stream` flag is set to true. +// When it is set to true it means service has finished synthesis and you can save your wave file with received synthesized audio content. service TTS { // Returns audio signal with synthesized speech, given text and optional configuration. From b704e8f32b2df5439c9439d1836f96865ffe100f Mon Sep 17 00:00:00 2001 From: jskrzypek Date: Tue, 2 Jul 2019 14:44:39 +0200 Subject: [PATCH 4/6] config change --- cpp/libtribune-client/tribune_client.cpp | 3 +-- cpp/libtribune-client/tribune_client.h | 7 ++----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/libtribune-client/tribune_client.cpp b/cpp/libtribune-client/tribune_client.cpp index 3fb350c..c2ef5ba 100644 --- a/cpp/libtribune-client/tribune_client.cpp +++ b/cpp/libtribune-client/tribune_client.cpp @@ -1,7 +1,6 @@ #include #include -#include "tribune_tts.grpc.pb.h" #include "tribune_client.h" @@ -15,7 +14,7 @@ SynthesizeRequest build_request(const TribuneClientConfig& config, const std::st SynthesizeRequest request; request.set_text(text); request.mutable_config()->set_sample_rate_hertz(config.sample_rate_hertz); - request.mutable_config()->set_use_opus(config.use_opus); + request.mutable_config()->set_encoding(config.encoding); return request; } diff --git a/cpp/libtribune-client/tribune_client.h b/cpp/libtribune-client/tribune_client.h index 5ac6019..2cdd773 100644 --- a/cpp/libtribune-client/tribune_client.h +++ b/cpp/libtribune-client/tribune_client.h @@ -1,19 +1,16 @@ #ifndef __TRIBUNE_CLIENT_H__ #define __TRIBUNE_CLIENT_H__ +#include "tribune_tts.grpc.pb.h" namespace techmo { namespace tribune { -enum AudioEncoding{ - LINEAR16, OGG_OPUS -}; - struct TribuneClientConfig { std::string session_id = ""; // Session ID to be passed to the service. If not specified, the service will generate a default session ID itself. // Session ID is the best way to match log's from client application with these on server side. int grpc_timeout = 0; // Timeout in milliseconds used to set gRPC deadline - how long the client is willing to wait for a reply from the server. unsigned int sample_rate_hertz = 0; // Sample rate in Hz of synthesized audio. If set to 0, the service will use voice's original sample rate. - AudioEncoding encoding = AudioEncoding::OGG_OPUS; + AudioEncoding encoding = AudioEncoding::LINEAR16; }; struct TribuneAudioData { From f7426062e7435d149cbd435cda1d93f7d4d67ae4 Mon Sep 17 00:00:00 2001 From: jskrzypek Date: Tue, 2 Jul 2019 15:02:05 +0200 Subject: [PATCH 5/6] reverting some changes --- python/wave_saver.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/wave_saver.py b/python/wave_saver.py index 7843c94..afdb126 100755 --- a/python/wave_saver.py +++ b/python/wave_saver.py @@ -32,10 +32,6 @@ def save(self, filename): w.setparams(params) w.writeframes(self.buffer) - def save_raw(self, filename): - with open(filename, 'w+b') as f: - f.write(self.buffer) - def load(self, filename): with wave.open(filename, 'r') as wr: self.buffer = wr.readframes(wr.getnframes()) From 5ec2c974eb23d1be15309b73f4c7769f8916d24b Mon Sep 17 00:00:00 2001 From: jskrzypek Date: Tue, 2 Jul 2019 18:05:24 +0200 Subject: [PATCH 6/6] more changes --- README.md | 7 +++++++ cpp/tribune-client/main.cpp | 20 ++++---------------- cpp/tribune-client/wave-utils.cpp | 13 +++++++++++++ cpp/tribune-client/wave-utils.h | 6 +++++- python/call_synthesize.py | 2 +- python/{saver_factory.py => file_saver.py} | 4 ++-- 6 files changed, 32 insertions(+), 20 deletions(-) rename python/{saver_factory.py => file_saver.py} (92%) diff --git a/README.md b/README.md index ec597af..a4d6130 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,10 @@ We provide sample TTS Client written in: - C++ in `cpp` (accepts text to be synthesized as a command line string), - Python in `python` (accepts text to be synthesized as a command line string or as a content of given text file). By default it saves "TechmoTTS.wav" file with received synthesized audio content. To use it, you have to specify provided by us service IP address and port using `--service-address` option with string in form "address:port". + +TTS supports following SSML tags: +`text` Change of volume. Parses numeric value (+ and -) +`text` Change of rate without change of pitch. Parses numeric value (+ and -) +`text` Change of pitch. Parses numeric value (+ and -) +`` Inserts silence. Parses numeric value in seconds or milliseconds ("3s", "750ms") +Prosody tags can be combined together, example: `text`. diff --git a/cpp/tribune-client/main.cpp b/cpp/tribune-client/main.cpp index d06b133..8d0ae92 100644 --- a/cpp/tribune-client/main.cpp +++ b/cpp/tribune-client/main.cpp @@ -62,28 +62,16 @@ int main(int argc, const char *const argv[]) { config.grpc_timeout = userOptions["grpc-timeout"].as(); config.sample_rate_hertz = sample_rate_hertz; const auto encoding = userOptions["audio-encoding"].as(); - if(encoding == "LINEAR16") { - config.encoding = techmo::tribune::AudioEncoding::LINEAR16; - } - else if(encoding == "OGG_OPUS") { - config.encoding = techmo::tribune::AudioEncoding::OGG_OPUS; - } - else { + const std::vector allowedEncodings{ "LINEAR16", "OGG_OPUS" }; + if(std::find(allowedEncodings.begin(), allowedEncodings.end(), encoding) == allowedEncodings.end()){ throw std::runtime_error("Unknown audio encoding: " + encoding); } - + techmo::tribune::AudioEncoding_Parse(encoding, &config.encoding); techmo::tribune::TribuneClient tribune_client{ userOptions["service-address"].as() }; const auto audio_data = tribune_client.Synthesize(config, userOptions["text"].as()); - if(config.encoding == techmo::tribune::AudioEncoding::LINEAR16) { - WriteWaveFile(userOptions["out-path"].as(), audio_data.sample_rate_hertz, audio_data.audio_bytes); - } - else { - std::fstream file(userOptions["out-path"].as(), std::ios::binary | std::ios::trunc | std::ios::out); - file.write(audio_data.audio_bytes.data(), audio_data.audio_bytes.size()); - file.flush(); - } + WriteFile(userOptions["out-path"].as(), audio_data.sample_rate_hertz, config.encoding, audio_data.audio_bytes); } catch (const std::exception &e) { std::cerr << e.what() << std::endl; diff --git a/cpp/tribune-client/wave-utils.cpp b/cpp/tribune-client/wave-utils.cpp index 6e86aba..e444765 100644 --- a/cpp/tribune-client/wave-utils.cpp +++ b/cpp/tribune-client/wave-utils.cpp @@ -43,3 +43,16 @@ void WriteWaveFile(const std::string & wavePath, unsigned int sampleRate, const wave_file.write((char*)&audioBytes[0], audioBytes.size()); wave_file.flush(); } + +void WriteFile(const std::string & path, unsigned int sampleRate, techmo::tribune::AudioEncoding encoding, const std::string & audioBytes){ + if(encoding == techmo::tribune::AudioEncoding::LINEAR16) { + WriteWaveFile(path, sampleRate, audioBytes); + } + else { + std::fstream file(path, std::ios::binary | std::ios::trunc | std::ios::out); + file.write(audioBytes.data(), audioBytes.size()); + file.flush(); + } + + +} diff --git a/cpp/tribune-client/wave-utils.h b/cpp/tribune-client/wave-utils.h index 9163272..570b11c 100644 --- a/cpp/tribune-client/wave-utils.h +++ b/cpp/tribune-client/wave-utils.h @@ -3,6 +3,7 @@ #include +#include "tribune_client.h" /// WAVE file header structure @@ -34,8 +35,11 @@ struct WAV_DATA /// Reads at given path WAVE file header into WAV_HEADER struct and audio bytes into std::string. WAV_DATA ReadWaveFile(const std::string & wavePath); -/// Writes at given path WAVE file with given sample rate [Hz] and audio bytes as std::string. +/// Writes WAVE file at given path with given sample rate [Hz] and audio bytes as std::string. void WriteWaveFile(const std::string & wavePath, unsigned int sampleRate, const std::string & audioBytes); +/// Writes file at given path depending on encoding. +void WriteFile(const std::string & path, unsigned int sampleRate, techmo::tribune::AudioEncoding encoding, const std::string & audioBytes); + #endif /* __WAVE_UTILS_H__ */ diff --git a/python/call_synthesize.py b/python/call_synthesize.py index f065b37..0aebfce 100644 --- a/python/call_synthesize.py +++ b/python/call_synthesize.py @@ -2,7 +2,7 @@ import tribune_tts_pb2_grpc import grpc import os -from saver_factory import SaverFactory +from file_saver import SaverFactory def call_synthesize(args, text): diff --git a/python/saver_factory.py b/python/file_saver.py similarity index 92% rename from python/saver_factory.py rename to python/file_saver.py index 4aa98bd..ad0dd51 100644 --- a/python/saver_factory.py +++ b/python/file_saver.py @@ -4,7 +4,7 @@ import os from wave_saver import WaveSaver -class SimpleSaver: +class SimpleFileSaver: """Raw saver for TTS""" buffer = None _framerate = None @@ -30,6 +30,6 @@ class SaverFactory: @staticmethod def get_saver(encoding): if encoding == tribune_tts_pb2.AudioEncoding.Value('OGG_OPUS'): - return SimpleSaver() + return SimpleFileSaver() else: return WaveSaver()