From cec2bebe7b37e5a66d6da2335dbcde9fd664b70c Mon Sep 17 00:00:00 2001 From: ffont Date: Mon, 4 May 2026 10:13:42 +0200 Subject: [PATCH 1/5] Use new essentia analyzer for consolidated descriptors This means that 2 descriptors related to chords need to disappear as these are not supported in new analyzer. These were not very relevant to Freesound anyway, and probably very faulty. --- .../descriptors.csv | 2 - _docs/api/source/analysis_docs.rst | 35 ------- _docs/api/source/resources.rst | 4 - freesound/settings.py | 91 ++++++++----------- sounds/models.py | 14 ++- 5 files changed, 50 insertions(+), 96 deletions(-) diff --git a/_docs/api/generate_analysis_documentation/descriptors.csv b/_docs/api/generate_analysis_documentation/descriptors.csv index 79b8f0887..c6214236c 100644 --- a/_docs/api/generate_analysis_documentation/descriptors.csv +++ b/_docs/api/generate_analysis_documentation/descriptors.csv @@ -7,8 +7,6 @@ boominess,ac_boominess,-,numeric,,,,0-100,"Boominess of the audio signal. A boom bpm,ac_tempo,-,integer,,rhythm.bpm,,,BPM value estimated by beat tracking algorithm.,https://en.wikipedia.org/wiki/Tempo bpm_confidence,ac_tempo_confidence,-,numeric,,,,0-1,Confidence score on how reliable the tempo (BPM) estimation is., brightness,ac_brightness,-,numeric,,,,0-100,Brightness of the audio signal. A bright sound is one that is clear/vibrant and/or contains significant high-pitched elements., -chord_count,tonal.chords_count,-,integer,advanced,,set to 0 if not meaningful,,Number of chords in the audio signal based on the number of detected chords by the chord_progression descriptor.,http://essentia.upf.edu/documentation/reference/streaming_ChordsDescriptors.html -chord_progression,tonal.chords_progression,VL,array[string],advanced,tonal.chords_strength,set to 0 if not meaningful,,"Chords estimated from the harmonic pitch class profiles (HPCPs) across the audio signal. 
Using the pitch classes [""A"", ""A#"", ""B"", ""C"", ""C#"", ""D"", ""D#"", ""E"", ""F"", ""F#"", ""G"", ""G#""], it finds the best-matching major or minor triad and outputs a time-varying chord sequence as a sequence of labels (e.g. A#, Bm). Note, chords are major if no minor symbol.",http://essentia.upf.edu/documentation/reference/streaming_ChordsDetection.html decay_strength,sfx.strongdecay,-,numeric,advanced,,,,Rate at which the audio signal's energy decays (i.e. how quickly it decreases) after the initial attack. It is computed from a non-linear combination of the signal's energy and its temporal centroid (the balance point of the signal's absolute amplitude).,https://essentia.upf.edu/reference/streaming_StrongDecay.html depth,ac_depth,-,numeric,,,,0-100,Depth of the audio signal. A deep sound is one that conveys the sense of having been made far down below the surface of its source., dissonance,lowlevel.dissonance,mean,numeric,,,,,Sensory dissonance of the audio signal given its spectral peaks.,http://essentia.upf.edu/documentation/reference/streaming_Dissonance.html diff --git a/_docs/api/source/analysis_docs.rst b/_docs/api/source/analysis_docs.rst index 809e21c57..b399dcc79 100644 --- a/_docs/api/source/analysis_docs.rst +++ b/_docs/api/source/analysis_docs.rst @@ -651,41 +651,6 @@ beat_loudness :height: 300px -chord_count -------------------------- - -:: - - curl https://freesound.org/api/sounds//analysis/chord_count - -**Description:** Number of chords in the audio signal based on the number of detected chords by the chord_progression descriptor. - -**Type:** integer - -**More information:** http://essentia.upf.edu/documentation/reference/streaming_ChordsDescriptors.html - -**Distribution in Freesound** - - .. 
image:: _static/descriptors/chord_count.png - :height: 300px - - -chord_progression -------------------------- - -:: - - curl https://freesound.org/api/sounds//analysis/chord_progression - -**Description:** Chords estimated from the harmonic pitch class profiles (HPCPs) across the audio signal. Using the pitch classes ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"], it finds the best-matching major or minor triad and outputs a time-varying chord sequence as a sequence of labels (e.g. A#, Bm). Note, chords are major if no minor symbol. - -**Mode:** VL - -**Type:** array[string] - -**More information:** http://essentia.upf.edu/documentation/reference/streaming_ChordsDetection.html - - decay_strength ------------------------- diff --git a/_docs/api/source/resources.rst b/_docs/api/source/resources.rst index fa361ffc9..1f7b80dca 100644 --- a/_docs/api/source/resources.rst +++ b/_docs/api/source/resources.rst @@ -347,8 +347,6 @@ boominess_ numeric yes Boominess of the audio sig bpm_ integer yes BPM value estimated by beat tracking algorithm. bpm_confidence_ numeric yes Confidence score on how reliable the tempo (BPM) estimation is. brightness_ numeric yes Brightness of the audio signal. A bright sound is one that is clear/vibrant and/or contains significant high-pitched elements. -chord_count_ integer yes Number of chords in the audio signal based on the number of detected chords by the chord_progression descriptor. -chord_progression_ array[string] no Chords estimated from the harmonic pitch class profiles (HPCPs) across the audio signal. Using the pitch classes ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"], it finds the best-matching major or minor triad and outputs a time-varying chord sequence as a sequence of labels (e.g. A#, Bm). Note, chords are major if no minor symbol. decay_strength_ numeric yes Rate at which the audio signal's energy decays (i.e. how quickly it decreases) after the initial attack. 
It is computed from a non-linear combination of the signal's energy and its temporal centroid (the balance point of the signal's absolute amplitude). depth_ numeric yes Depth of the audio signal. A deep sound is one that conveys the sense of having been made far down below the surface of its source. dissonance_ numeric yes Sensory dissonance of the audio signal given its spectral peaks. @@ -408,8 +406,6 @@ zero_crossing_rate_ numeric yes Zero-crossing rate of the .. _bpm: https://freesound.org/docs/api/analysis_docs.html#bpm .. _bpm_confidence: https://freesound.org/docs/api/analysis_docs.html#bpm-confidence .. _brightness: https://freesound.org/docs/api/analysis_docs.html#brightness -.. _chord_count: https://freesound.org/docs/api/analysis_docs.html#chord-count -.. _chord_progression: https://freesound.org/docs/api/analysis_docs.html#chord-progression .. _decay_strength: https://freesound.org/docs/api/analysis_docs.html#decay-strength .. _depth: https://freesound.org/docs/api/analysis_docs.html#depth .. 
_dissonance: https://freesound.org/docs/api/analysis_docs.html#dissonance diff --git a/freesound/settings.py b/freesound/settings.py index aa3931afe..bd2b479e9 100644 --- a/freesound/settings.py +++ b/freesound/settings.py @@ -665,7 +665,7 @@ ORCHESTRATE_ANALYSIS_MAX_TIME_CONVERTED_FILES_IN_DISK = 24 * 7 # in hours AUDIOCOMMONS_ANALYZER_NAME = "ac-extractor_v3" -FREESOUND_ESSENTIA_EXTRACTOR_NAME = "fs-essentia-extractor_legacy" +FREESOUND_ESSENTIA_EXTRACTOR_NAME = "fs-essentia-extractor_v1" BIRDNET_ANALYZER_NAME = "birdnet_v1" FSDSINET_ANALYZER_NAME = "fsd-sinet_v1" BST_ANALYZER_NAME = "bst-extractor_v1" @@ -708,23 +708,23 @@ { "name": "amplitude_peak_ratio", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["max_to_total"], + "get_func": lambda d, s: d["sfx.max_to_total"], }, { "name": "beat_count", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm"]["beats_count"], + "get_func": lambda d, s: d["rhythm.beats_count"], "type": AUDIO_DESCRIPTOR_TYPE_INT, }, { "name": "beat_loudness", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm"]["beats_loudness"]["mean"], # Increase precision? + "get_func": lambda d, s: d["rhythm.beats_loudness.mean"], # Increase precision? 
}, { "name": "beat_times", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm"]["beats_position"], + "get_func": lambda d, s: d["rhythm.beats_position"], "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, "index": False, }, @@ -748,25 +748,10 @@ "analyzer": AUDIOCOMMONS_ANALYZER_NAME, "original_name": "brightness", }, - { - "name": "chord_count", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal"]["chords_count"], - "type": AUDIO_DESCRIPTOR_TYPE_INT, - "condition": condition_music_or_instrument_samples, - }, - { - "name": "chord_progression", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal"]["chords_progression"], - "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS, - "condition": condition_music_or_instrument_samples, - "index": False, - }, { "name": "decay_strength", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["strongdecay"], + "get_func": lambda d, s: d["sfx.strongdecay"], }, { "name": "depth", @@ -776,12 +761,12 @@ { "name": "dissonance", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["dissonance"]["mean"], + "get_func": lambda d, s: d["lowlevel.dissonance.mean"], }, { "name": "duration_effective", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["effective_duration"]["mean"], + "get_func": lambda d, s: d["sfx.effective_duration"], }, { "name": "dynamic_range", @@ -796,24 +781,24 @@ { "name": "hpcp", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal"]["hpcp"]["mean"], + "get_func": lambda d, s: d["tonal.hpcp.mean"], "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
"index": False, }, { "name": "hpcp_crest", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal"]["hpcp_crest"]["mean"], + "get_func": lambda d, s: d["tonal.hpcp_crest.mean"], }, { "name": "hpcp_entropy", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal"]["hpcp_entropy"]["mean"], + "get_func": lambda d, s: d["tonal.hpcp_entropy.mean"], }, { "name": "inharmonicity", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["inharmonicity"]["mean"], + "get_func": lambda d, s: d["sfx.inharmonicity.mean"], }, { "name": "log_attack_time", @@ -834,7 +819,7 @@ { "name": "mfcc", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["mfcc"]["mean"], + "get_func": lambda d, s: d["lowlevel.mfcc.mean"], "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? "index": False, }, @@ -861,40 +846,40 @@ { "name": "onset_count", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm"]["onset_count"], + "get_func": lambda d, s: d["rhythm.onset_count"], "type": AUDIO_DESCRIPTOR_TYPE_INT, }, { "name": "onset_times", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm"]["onset_times"], + "get_func": lambda d, s: d["rhythm.onset_times"], "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, "index": False, }, { "name": "pitch", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["pitch"]["mean"], + "get_func": lambda d, s: d["lowlevel.pitch.mean"], }, { "name": "pitch_max", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["pitch"]["max"], + "get_func": lambda d, s: d["lowlevel.pitch.max"], }, { "name": "pitch_min", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["pitch"]["min"], + "get_func": lambda d, s: d["lowlevel.pitch.min"], }, { "name": "pitch_salience", "analyzer": 
FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["pitch_salience"]["mean"], + "get_func": lambda d, s: d["lowlevel.pitch_salience.mean"], }, { "name": "pitch_var", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["pitch"]["var"], + "get_func": lambda d, s: d["lowlevel.pitch.var"], }, { "name": "reverbness", @@ -915,7 +900,7 @@ { "name": "silence_rate", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["silence_rate_30dB"]["mean"], + "get_func": lambda d, s: d["lowlevel.silence_rate_30dB.mean"], }, { "name": "single_event", @@ -927,78 +912,78 @@ { "name": "spectral_centroid", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_centroid"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_centroid.mean"], }, { "name": "spectral_complexity", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_complexity"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_complexity.mean"], }, { "name": "spectral_crest", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_crest"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_crest.mean"], }, { "name": "spectral_energy", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_energy"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_energy.mean"], }, { "name": "spectral_entropy", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_entropy"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_entropy.mean"], }, { "name": "spectral_flatness", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_flatness_db"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_flatness_db.mean"], }, { "name": "spectral_rolloff", "analyzer": 
FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_rolloff"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_rolloff.mean"], }, { "name": "spectral_skewness", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_skewness"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_skewness.mean"], }, { "name": "spectral_spread", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["spectral_spread"]["mean"], + "get_func": lambda d, s: d["lowlevel.spectral_spread.mean"], }, { "name": "start_time", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["startFrame"], + "get_func": lambda d, s: d["lowlevel.sound_start_frame"], "transformation": lambda v, d, s: (v * 2048.0) / 44100.0, # Convert from frames to seconds }, { "name": "temporal_centroid", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["temporal_centroid"]["mean"], + "get_func": lambda d, s: d["sfx.temporal_centroid"], }, { "name": "temporal_centroid_ratio", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["tc_to_total"], + "get_func": lambda d, s: d["sfx.tc_to_total"], }, { "name": "temporal_decrease", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["temporal_decrease"]["mean"], + "get_func": lambda d, s: d["sfx.temporal_decrease"], }, { "name": "temporal_skewness", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["temporal_skewness"]["mean"], + "get_func": lambda d, s: d["sfx.temporal_skewness"], }, { "name": "temporal_spread", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx"]["temporal_spread"]["mean"], + "get_func": lambda d, s: d["sfx.temporal_spread"], }, { "name": "tonality", @@ -1014,7 +999,7 @@ { "name": "tristimulus", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": 
lambda d, s: d["sfx"]["tristimulus"]["mean"], + "get_func": lambda d, s: d["sfx.tristimulus.mean"], "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? "index": False, }, @@ -1026,7 +1011,7 @@ { "name": "zero_crossing_rate", "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel"]["zerocrossingrate"]["mean"], + "get_func": lambda d, s: d["lowlevel.zerocrossingrate.mean"], }, { "name": "has_audio_problems", diff --git a/sounds/models.py b/sounds/models.py index 15af041a8..81cbbe10a 100644 --- a/sounds/models.py +++ b/sounds/models.py @@ -1429,7 +1429,7 @@ def analyze( sounds_logger.info(f"Not sending sound {self.id} to analyzer {analyzer} as is already queued") return sa - def consolidate_analysis(self, no_db_operations=False): + def consolidate_analysis(self, no_db_operations=False, fail_if_missing=False): """ This method post-processes the analysis results of all analyzers for this sound and consolidates them into a new SoundAnalysis object with analyzer name settings.CONSOLIDATED_ANALYZER_NAME. This consolidated analysis contains @@ -1438,6 +1438,7 @@ def consolidate_analysis(self, no_db_operations=False): :param bool no_db_operations: If True, the method only computes the data but does not save it in the DB. Also it returns the consolidated analysis data dictionary instead of the SoundAnalysis object. + :param bool fail_if_missing: If True, the method raises an exception if any analyzer data or descriptor data is missing. """ # Iterate over all descriptors defined in settings.CONSOLIDATED_AUDIO_DESCRIPTORS and obtain/process their values @@ -1458,6 +1459,12 @@ def consolidate_analysis(self, no_db_operations=False): if not analyzer_data: # Analyzer data could not be loaded from file. 
That means that the analyzer has not analyzed # the sound successfully, skip descriptor + if fail_if_missing: + raise Exception(f"Analyzer data for {analyzer} is missing (sound id: {self.id})") + else: + print( + f"Analyzer data for {analyzer} is missing (sound id: {self.id}), skipping descriptors from this analyzer" + ) continue # Save the data in tmp dict so it is not loaded again in the future if present tmp_analyzers_data[analyzer] = analyzer_data @@ -1486,7 +1493,10 @@ def consolidate_analysis(self, no_db_operations=False): value = get_func(analyzer_data, self) except Exception as e: # If value can't be loaded, continue with next descriptor - print(f"Can't get value for descriptor {name}: {e} (sound id: {self.id})") + if fail_if_missing: + raise Exception(f"Can't get value for descriptor {name}: {e} (sound id: {self.id})") + else: + print(f"Can't get value for descriptor {name}: {e} (sound id: {self.id}), skipping this descriptor") continue if value is not None and type(value) == float and math.isnan(value): From 463f9b0d8401ce6e077abaaf57f7c6367eccec17 Mon Sep 17 00:00:00 2001 From: ffont Date: Mon, 4 May 2026 10:42:08 +0200 Subject: [PATCH 2/5] Remove analysis_files API field We'll no longer use it as with the new essentia extractor we don't compute "frames" files. 
This commit also removes the "analysis" entry in sound locations as this has been managed by SoundAnalysis objects since these were introduced and we don't need it --- _docs/api/source/resources.rst | 1 - apiv2/serializers.py | 15 +-------------- sounds/models.py | 25 +------------------------ sounds/tests/test_sound.py | 28 ---------------------------- 4 files changed, 2 insertions(+), 67 deletions(-) diff --git a/_docs/api/source/resources.rst b/_docs/api/source/resources.rst index 1f7b80dca..7c0316b78 100644 --- a/_docs/api/source/resources.rst +++ b/_docs/api/source/resources.rst @@ -325,7 +325,6 @@ comments URI yes* The URI of a paginated l num_comments integer yes The number of times the sound was commented. comment URI no The URI to comment the sound. similar_sounds URI no URI pointing to the :ref:`similar-sounds` resource (to get a list of similar sounds). -analysis_files URIs no List of URIs for retrieving files with analysis information for each frame of the sound (see :ref:`analysis-docs`). ========================= ================ ========= ==================================================================================== Additionally, content-based audio descriptors extracted from the sound signal can be used as fields. 
diff --git a/apiv2/serializers.py b/apiv2/serializers.py index 1d8f2e9b0..784021c0d 100644 --- a/apiv2/serializers.py +++ b/apiv2/serializers.py @@ -42,7 +42,7 @@ + "geotag,is_geotagged,created,license,type,channels,filesize,bitrate," + "bitdepth,duration,samplerate,username,pack,pack_name,download,bookmark,previews,images," + "num_downloads,avg_rating,num_ratings,rate,comments,num_comments,comment,similar_sounds," - + "analysis_files,is_explicit,is_remix,was_remixed,md5,ai_preference" + + "is_explicit,is_remix,was_remixed,md5,ai_preference" ) DEFAULT_FIELDS_IN_PACK_DETAIL = None # Separated by commas (None = all) @@ -168,7 +168,6 @@ class Meta: "num_comments", "comment", "similar_sounds", - "analysis_files", "is_explicit", "score", "is_remix", @@ -317,18 +316,6 @@ def get_images(self, obj): ), } - analysis_files = serializers.SerializerMethodField() - - def get_analysis_files(self, obj): - return { - "essentia_frames": prepend_base( - obj.locations("analysis.frames.url"), request_is_secure=self.context["request"].is_secure() - ), - "essentia_stats": prepend_base( - obj.locations("analysis.statistics.url"), request_is_secure=self.context["request"].is_secure() - ), - } - similar_sounds = serializers.SerializerMethodField() def get_similar_sounds(self, obj): diff --git a/sounds/models.py b/sounds/models.py index 81cbbe10a..36e79b982 100644 --- a/sounds/models.py +++ b/sounds/models.py @@ -837,27 +837,6 @@ def locations(self): ), ), ), - analysis=dict( - base_path=os.path.join(settings.ANALYSIS_PATH, id_folder), - statistics=dict( - path=os.path.join( - settings.ANALYSIS_PATH, - id_folder, - "%d-%s.yaml" % (self.id, settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME), - ), - url=settings.ANALYSIS_URL - + "%s/%d-%s.yaml" % (id_folder, self.id, settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME), - ), - frames=dict( - path=os.path.join( - settings.ANALYSIS_PATH, - id_folder, - "%d-%s_frames.json" % (self.id, settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME), - ), - url=settings.ANALYSIS_URL 
- + "%s/%d-%s_frames.json" % (id_folder, self.id, settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME), - ), - ), ) def get_preview_abs_url(self): @@ -1229,8 +1208,6 @@ def replace_user_id_in_path(path, old_owner_id, new_owner_id): # Rename related files in disk paths_to_rename = [ self.locations("path"), # original file path - self.locations("analysis.frames.path"), # analysis frames file - self.locations("analysis.statistics.path"), # analysis statistics file self.locations("display.spectral.L.path"), # spectrogram L self.locations("display.spectral.M.path"), # spectrogram M self.locations("display.wave_bw.L.path"), # waveform BW L @@ -1411,7 +1388,7 @@ def analyze( kwargs={ "sound_id": self.id, "sound_path": sound_path, - "analysis_folder": self.locations("analysis.base_path"), + "analysis_folder": os.path.join(settings.ANALYSIS_PATH, str(self.id // 1000)), "metadata": json.dumps( { "duration": self.duration, diff --git a/sounds/tests/test_sound.py b/sounds/tests/test_sound.py index 4d19c247a..f2e404f94 100644 --- a/sounds/tests/test_sound.py +++ b/sounds/tests/test_sound.py @@ -739,34 +739,6 @@ def test_download(self, sendfile): resp = self._get_sound_from_profile_page(self.user) self.assertContains(resp, 'data-num-downloads="1"') - # Similarity link (cached in display and view) - @mock.patch("general.management.commands.similarity_update.Similarity.add", return_value="Dummy response") - def _test_similarity_update(self, cache_keys, expected, request_func, similarity_add, user=None): - # Create a SoundAnalysis object with status OK so "similarity_update" command will pick it up - SoundAnalysis.objects.create( - sound=self.sound, analyzer=settings.FREESOUND_ESSENTIA_EXTRACTOR_NAME, analysis_status="OK" - ) - self.sound.save() - - self._assertCacheAbsent(cache_keys) - - self.client.force_login(self.user) - - # Initial check - self.assertEqual(self.sound.similarity_state, "PE") - self.assertNotContains(request_func(user) if user is not None else request_func(), expected) - 
self._assertCachePresent(cache_keys) - - # Update similarity - call_command("similarity_update") - similarity_add.assert_called_once_with(self.sound.id, self.sound.locations("analysis.statistics.path")) - self._assertCacheAbsent(cache_keys) - - # Check similarity icon - self.sound.refresh_from_db() - self.assertEqual(self.sound.similarity_state, "OK") - self.assertContains(request_func(user) if user is not None else request_func(), expected) - # Pack link (cached in display and view) def _test_add_remove_pack(self, cache_keys, text, request_func, user=None): self._assertCacheAbsent(cache_keys) From e93fcb9295b0cc934510437465df24d01451e8a2 Mon Sep 17 00:00:00 2001 From: ffont Date: Fri, 8 May 2026 11:09:46 +0200 Subject: [PATCH 3/5] Load more descriptors from essentia analyzer --- freesound/settings.py | 44 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/freesound/settings.py b/freesound/settings.py index ede9e3def..d60404e8b 100644 --- a/freesound/settings.py +++ b/freesound/settings.py @@ -735,13 +735,13 @@ }, { "name": "bpm", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "tempo", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.bpm"], }, { "name": "bpm_confidence", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "tempo_confidence", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.bpm_confidence"], }, { "name": "brightness", @@ -770,8 +770,8 @@ }, { "name": "dynamic_range", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "brightness", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "original_name": lambda d, s: d["lowlevel.loudness_ebu128.loudness_range"], }, { "name": "hardness", @@ -802,19 +802,19 @@ }, { "name": "log_attack_time", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "log_attack_time", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: 
d["sfx.logattacktime"], }, { "name": "loopable", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "loop", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.loopable"], "type": AUDIO_DESCRIPTOR_TYPE_BOOL, }, { "name": "loudness", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "loudness", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["lowlevel.loudness_ebu128.integrated"], }, { "name": "mfcc", @@ -825,21 +825,21 @@ }, { "name": "note_confidence", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "note_confidence", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.note_confidence"], "condition": condition_instrument_samples, }, { "name": "note_midi", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "note_midi", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.note_midi"], "type": AUDIO_DESCRIPTOR_TYPE_INT, "condition": condition_instrument_samples, }, { "name": "note_name", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "note_name", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.note_name"], "type": AUDIO_DESCRIPTOR_TYPE_STRING, "condition": condition_instrument_samples, }, @@ -987,14 +987,14 @@ }, { "name": "tonality", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "tonality", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.tonality"], "type": AUDIO_DESCRIPTOR_TYPE_STRING, }, { "name": "tonality_confidence", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "tonality_confidence", + "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, + "get_func": lambda d, s: d["fs.tonality_confidence"], }, { "name": "tristimulus", From 7e48c8eb204ba8053240239db65605cc6bbbf89d Mon Sep 17 00:00:00 2001 From: ffont Date: Thu, 11 Jun 2026 17:31:11 +0200 Subject: [PATCH 4/5] Move audio descriptors settings to 
new file --- freesound/audio_descriptor_settings.py | 390 ++++++++++++++++++++++++ freesound/settings.py | 393 +------------------------ 2 files changed, 393 insertions(+), 390 deletions(-) create mode 100644 freesound/audio_descriptor_settings.py diff --git a/freesound/audio_descriptor_settings.py b/freesound/audio_descriptor_settings.py new file mode 100644 index 000000000..fedda0955 --- /dev/null +++ b/freesound/audio_descriptor_settings.py @@ -0,0 +1,390 @@ +AUDIO_DESCRIPTOR_TYPE_FLOAT = "float" +AUDIO_DESCRIPTOR_TYPE_INT = "int" +AUDIO_DESCRIPTOR_TYPE_BOOL = "bool" +AUDIO_DESCRIPTOR_TYPE_STRING = "string" +AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS = "list_of_strings" +AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY = "float_array" +AUDIO_DESCRIPTOR_TYPE_JSON = "json" # For complex structures +DEFAULT_AUDIO_DESCRIPTOR_TYPE = AUDIO_DESCRIPTOR_TYPE_FLOAT +DEFAULT_AUDIO_DESCRIPTOR_FLOAT_PRECISION = 3 # Number of decimal digits for float audio descriptors + +condition_music_or_instrument_samples = lambda s: s.category_names[0] in ["Music", "Instrument samples"] +condition_instrument_samples = lambda s: s.category_names[0] == "Instrument samples" +condition_sfx_or_soundscapes = lambda s: s.category_names[0] in ["Sound effects", "Soundscapes"] +CONSOLIDATED_ANALYZER_NAME = "consolidated" +CONSOLIDATED_AUDIO_DESCRIPTORS = [ + { + "name": "category", + "analyzer": "bst-extractor_v2", + "original_name": "bst_top_level", + "type": AUDIO_DESCRIPTOR_TYPE_STRING, + }, + { + "name": "subcategory", + "analyzer": "bst-extractor_v2", + "original_name": "bst_second_level", + "type": AUDIO_DESCRIPTOR_TYPE_STRING, + }, + { + "name": "amplitude_peak_ratio", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.max_to_total"], + }, + { + "name": "beat_count", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["rhythm.beats_count"], + "type": AUDIO_DESCRIPTOR_TYPE_INT, + }, + { + "name": "beat_loudness", + "analyzer": "fs-essentia-extractor_v1", + "get_func": 
lambda d, s: d["rhythm.beats_loudness.mean"], # Increase precision? + }, + { + "name": "beat_times", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["rhythm.beats_position"], + "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, + "index": False, + }, + { + "name": "boominess", + "analyzer": "ac-extractor_v3", + "original_name": "boominess", + }, + { + "name": "bpm", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.bpm"], + }, + { + "name": "bpm_confidence", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.bpm_confidence"], + }, + { + "name": "brightness", + "analyzer": "ac-extractor_v3", + "original_name": "brightness", + }, + { + "name": "decay_strength", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.strongdecay"], + }, + { + "name": "depth", + "analyzer": "ac-extractor_v3", + "original_name": "depth", + }, + { + "name": "dissonance", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.dissonance.mean"], + }, + { + "name": "duration_effective", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.effective_duration"], + }, + { + "name": "dynamic_range", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.loudness_ebu128.loudness_range"], + }, + { + "name": "hardness", + "analyzer": "ac-extractor_v3", + "original_name": "hardness", + }, + { + "name": "hpcp", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["tonal.hpcp.mean"], + "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
+ "index": False, + }, + { + "name": "hpcp_crest", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["tonal.hpcp_crest.mean"], + }, + { + "name": "hpcp_entropy", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["tonal.hpcp_entropy.mean"], + }, + { + "name": "inharmonicity", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.inharmonicity.mean"], + }, + { + "name": "log_attack_time", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.logattacktime"], + }, + { + "name": "loopable", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.loopable"], + "type": AUDIO_DESCRIPTOR_TYPE_BOOL, + }, + { + "name": "loudness", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.loudness_ebu128.integrated"], + }, + { + "name": "mfcc", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.mfcc.mean"], + "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
+ "index": False, + }, + { + "name": "note_confidence", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.note_confidence"], + "condition": condition_instrument_samples, + }, + { + "name": "note_midi", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.note_midi"], + "type": AUDIO_DESCRIPTOR_TYPE_INT, + "condition": condition_instrument_samples, + }, + { + "name": "note_name", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.note_name"], + "type": AUDIO_DESCRIPTOR_TYPE_STRING, + "condition": condition_instrument_samples, + }, + { + "name": "onset_count", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["rhythm.onset_count"], + "type": AUDIO_DESCRIPTOR_TYPE_INT, + }, + { + "name": "onset_times", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["rhythm.onset_times"], + "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, + "index": False, + }, + { + "name": "pitch", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.pitch.mean"], + }, + { + "name": "pitch_max", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.pitch.max"], + }, + { + "name": "pitch_min", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.pitch.min"], + }, + { + "name": "pitch_salience", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.pitch_salience.mean"], + }, + { + "name": "pitch_var", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.pitch.var"], + }, + { + "name": "reverbness", + "analyzer": "ac-extractor_v3", + "original_name": "reverb", + "type": AUDIO_DESCRIPTOR_TYPE_BOOL, + }, + { + "name": "roughness", + "analyzer": "ac-extractor_v3", + "original_name": "roughness", + }, + { + "name": "sharpness", + "analyzer": "ac-extractor_v3", + "original_name": "sharpness", + }, + { + "name": "silence_rate", + "analyzer": 
"fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.silence_rate_30dB.mean"], + }, + { + "name": "single_event", + "analyzer": "ac-extractor_v3", + "original_name": "single_event", + "type": AUDIO_DESCRIPTOR_TYPE_BOOL, + "transformation": lambda v, d, s: v if s.category_names[0] not in ["Music", "Soundscapes"] else False, + }, + { + "name": "spectral_centroid", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_centroid.mean"], + }, + { + "name": "spectral_complexity", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_complexity.mean"], + }, + { + "name": "spectral_crest", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_crest.mean"], + }, + { + "name": "spectral_energy", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_energy.mean"], + }, + { + "name": "spectral_entropy", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_entropy.mean"], + }, + { + "name": "spectral_flatness", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_flatness_db.mean"], + }, + { + "name": "spectral_rolloff", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_rolloff.mean"], + }, + { + "name": "spectral_skewness", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_skewness.mean"], + }, + { + "name": "spectral_spread", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.spectral_spread.mean"], + }, + { + "name": "start_time", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.sound_start_frame"], + "transformation": lambda v, d, s: (v * 2048.0) / 44100.0, # Convert from frames to seconds + }, + { + "name": "temporal_centroid", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: 
d["sfx.temporal_centroid"], + }, + { + "name": "temporal_centroid_ratio", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.tc_to_total"], + }, + { + "name": "temporal_decrease", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.temporal_decrease"], + }, + { + "name": "temporal_skewness", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.temporal_skewness"], + }, + { + "name": "temporal_spread", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.temporal_spread"], + }, + { + "name": "tonality", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.tonality"], + "type": AUDIO_DESCRIPTOR_TYPE_STRING, + }, + { + "name": "tonality_confidence", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["fs.tonality_confidence"], + }, + { + "name": "tristimulus", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["sfx.tristimulus.mean"], + "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
+ "index": False, + }, + { + "name": "warmth", + "analyzer": "ac-extractor_v3", + "original_name": "warmth", + }, + { + "name": "zero_crossing_rate", + "analyzer": "fs-essentia-extractor_v1", + "get_func": lambda d, s: d["lowlevel.zerocrossingrate.mean"], + }, + { + "name": "has_audio_problems", + "analyzer": "fs-essentia-problem-detection_v1", + "original_name": "error", + "type": AUDIO_DESCRIPTOR_TYPE_BOOL, + }, + { + "name": "birdnet_detected_class", + "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS, + "analyzer": "birdnet_v1", + "original_name": "detected_classes", + "transformation": lambda v, d, s: None if v == [] else v, + "condition": condition_sfx_or_soundscapes, + }, + { + "name": "birdnet_detections", + "analyzer": "birdnet_v1", + "type": AUDIO_DESCRIPTOR_TYPE_JSON, + "original_name": "detections", + "transformation": lambda v, d, s: None if v == [] else v, + "condition": condition_sfx_or_soundscapes, + "index": False, + }, + { + "name": "birdnet_detections_count", + "type": AUDIO_DESCRIPTOR_TYPE_INT, + "analyzer": "birdnet_v1", + "original_name": "num_detections", + "condition": condition_sfx_or_soundscapes, + }, + { + "name": "fsdsinet_detected_class", + "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS, + "analyzer": "fsd-sinet_v1", + "original_name": "detected_classes", + "transformation": lambda v, d, s: None if v == [] else v, + }, + { + "name": "fsdsinet_detections", + "analyzer": "fsd-sinet_v1", + "type": AUDIO_DESCRIPTOR_TYPE_JSON, + "original_name": "detections", + "transformation": lambda v, d, s: None if v == [] else v, + "index": False, + }, + { + "name": "fsdsinet_detections_count", + "type": AUDIO_DESCRIPTOR_TYPE_INT, + "analyzer": "fsd-sinet_v1", + "original_name": "num_detections", + }, +] + +CONSOLIDATED_AUDIO_DESCRIPTORS_ANALYZER_NAMES = list(set([ad["analyzer"] for ad in CONSOLIDATED_AUDIO_DESCRIPTORS])) +AVAILABLE_AUDIO_DESCRIPTORS_NAMES = [desc["name"] for desc in CONSOLIDATED_AUDIO_DESCRIPTORS] diff --git a/freesound/settings.py 
b/freesound/settings.py index e21138f7a..1a23e036a 100644 --- a/freesound/settings.py +++ b/freesound/settings.py @@ -520,397 +520,10 @@ def load_broad_sound_taxonomy_from_csv(path): BSTV2_ANALYZER_NAME: {}, } -AUDIO_DESCRIPTOR_TYPE_FLOAT = "float" -AUDIO_DESCRIPTOR_TYPE_INT = "int" -AUDIO_DESCRIPTOR_TYPE_BOOL = "bool" -AUDIO_DESCRIPTOR_TYPE_STRING = "string" -AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS = "list_of_strings" -AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY = "float_array" -AUDIO_DESCRIPTOR_TYPE_JSON = "json" # For complex structures -DEFAULT_AUDIO_DESCRIPTOR_TYPE = AUDIO_DESCRIPTOR_TYPE_FLOAT -DEFAULT_AUDIO_DESCRIPTOR_FLOAT_PRECISION = 3 # Number of decimal digits for float audio descriptors - -condition_music_or_instrument_samples = lambda s: s.category_names[0] in ["Music", "Instrument samples"] -condition_instrument_samples = lambda s: s.category_names[0] == "Instrument samples" -condition_sfx_or_soundscapes = lambda s: s.category_names[0] in ["Sound effects", "Soundscapes"] -CONSOLIDATED_ANALYZER_NAME = "consolidated" -CONSOLIDATED_AUDIO_DESCRIPTORS = [ - { - "name": "category", - "analyzer": BSTV2_ANALYZER_NAME, - "original_name": "bst_top_level", - "type": AUDIO_DESCRIPTOR_TYPE_STRING, - }, - { - "name": "subcategory", - "analyzer": BSTV2_ANALYZER_NAME, - "original_name": "bst_second_level", - "type": AUDIO_DESCRIPTOR_TYPE_STRING, - }, - { - "name": "amplitude_peak_ratio", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.max_to_total"], - }, - { - "name": "beat_count", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm.beats_count"], - "type": AUDIO_DESCRIPTOR_TYPE_INT, - }, - { - "name": "beat_loudness", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm.beats_loudness.mean"], # Increase precision? 
- }, - { - "name": "beat_times", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm.beats_position"], - "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, - "index": False, - }, - { - "name": "boominess", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "boominess", - }, - { - "name": "bpm", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.bpm"], - }, - { - "name": "bpm_confidence", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.bpm_confidence"], - }, - { - "name": "brightness", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "brightness", - }, - { - "name": "decay_strength", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.strongdecay"], - }, - { - "name": "depth", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "depth", - }, - { - "name": "dissonance", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.dissonance.mean"], - }, - { - "name": "duration_effective", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.effective_duration"], - }, - { - "name": "dynamic_range", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "original_name": lambda d, s: d["lowlevel.loudness_ebu128.loudness_range"], - }, - { - "name": "hardness", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "hardness", - }, - { - "name": "hpcp", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal.hpcp.mean"], - "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
- "index": False, - }, - { - "name": "hpcp_crest", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal.hpcp_crest.mean"], - }, - { - "name": "hpcp_entropy", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["tonal.hpcp_entropy.mean"], - }, - { - "name": "inharmonicity", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.inharmonicity.mean"], - }, - { - "name": "log_attack_time", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.logattacktime"], - }, - { - "name": "loopable", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.loopable"], - "type": AUDIO_DESCRIPTOR_TYPE_BOOL, - }, - { - "name": "loudness", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.loudness_ebu128.integrated"], - }, - { - "name": "mfcc", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.mfcc.mean"], - "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
- "index": False, - }, - { - "name": "note_confidence", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.note_confidence"], - "condition": condition_instrument_samples, - }, - { - "name": "note_midi", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.note_midi"], - "type": AUDIO_DESCRIPTOR_TYPE_INT, - "condition": condition_instrument_samples, - }, - { - "name": "note_name", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.note_name"], - "type": AUDIO_DESCRIPTOR_TYPE_STRING, - "condition": condition_instrument_samples, - }, - { - "name": "onset_count", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm.onset_count"], - "type": AUDIO_DESCRIPTOR_TYPE_INT, - }, - { - "name": "onset_times", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["rhythm.onset_times"], - "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, - "index": False, - }, - { - "name": "pitch", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.pitch.mean"], - }, - { - "name": "pitch_max", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.pitch.max"], - }, - { - "name": "pitch_min", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.pitch.min"], - }, - { - "name": "pitch_salience", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.pitch_salience.mean"], - }, - { - "name": "pitch_var", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.pitch.var"], - }, - { - "name": "reverbness", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "reverb", - "type": AUDIO_DESCRIPTOR_TYPE_BOOL, - }, - { - "name": "roughness", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "roughness", - }, - { - "name": "sharpness", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - 
"original_name": "sharpness", - }, - { - "name": "silence_rate", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.silence_rate_30dB.mean"], - }, - { - "name": "single_event", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "single_event", - "type": AUDIO_DESCRIPTOR_TYPE_BOOL, - "transformation": lambda v, d, s: v if s.category_names[0] not in ["Music", "Soundscapes"] else False, - }, - { - "name": "spectral_centroid", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_centroid.mean"], - }, - { - "name": "spectral_complexity", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_complexity.mean"], - }, - { - "name": "spectral_crest", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_crest.mean"], - }, - { - "name": "spectral_energy", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_energy.mean"], - }, - { - "name": "spectral_entropy", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_entropy.mean"], - }, - { - "name": "spectral_flatness", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_flatness_db.mean"], - }, - { - "name": "spectral_rolloff", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_rolloff.mean"], - }, - { - "name": "spectral_skewness", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_skewness.mean"], - }, - { - "name": "spectral_spread", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.spectral_spread.mean"], - }, - { - "name": "start_time", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.sound_start_frame"], - "transformation": lambda v, d, s: (v * 2048.0) 
/ 44100.0, # Convert from frames to seconds - }, - { - "name": "temporal_centroid", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.temporal_centroid"], - }, - { - "name": "temporal_centroid_ratio", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.tc_to_total"], - }, - { - "name": "temporal_decrease", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.temporal_decrease"], - }, - { - "name": "temporal_skewness", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.temporal_skewness"], - }, - { - "name": "temporal_spread", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.temporal_spread"], - }, - { - "name": "tonality", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.tonality"], - "type": AUDIO_DESCRIPTOR_TYPE_STRING, - }, - { - "name": "tonality_confidence", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["fs.tonality_confidence"], - }, - { - "name": "tristimulus", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["sfx.tristimulus.mean"], - "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY, # Increase precision? 
- "index": False, - }, - { - "name": "warmth", - "analyzer": AUDIOCOMMONS_ANALYZER_NAME, - "original_name": "warmth", - }, - { - "name": "zero_crossing_rate", - "analyzer": FREESOUND_ESSENTIA_EXTRACTOR_NAME, - "get_func": lambda d, s: d["lowlevel.zerocrossingrate.mean"], - }, - { - "name": "has_audio_problems", - "analyzer": "fs-essentia-problem-detection_v1", - "original_name": "error", - "type": AUDIO_DESCRIPTOR_TYPE_BOOL, - }, - { - "name": "birdnet_detected_class", - "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS, - "analyzer": BIRDNET_ANALYZER_NAME, - "original_name": "detected_classes", - "transformation": lambda v, d, s: None if v == [] else v, - "condition": condition_sfx_or_soundscapes, - }, - { - "name": "birdnet_detections", - "analyzer": BIRDNET_ANALYZER_NAME, - "type": AUDIO_DESCRIPTOR_TYPE_JSON, - "original_name": "detections", - "transformation": lambda v, d, s: None if v == [] else v, - "condition": condition_sfx_or_soundscapes, - "index": False, - }, - { - "name": "birdnet_detections_count", - "type": AUDIO_DESCRIPTOR_TYPE_INT, - "analyzer": BIRDNET_ANALYZER_NAME, - "original_name": "num_detections", - "condition": condition_sfx_or_soundscapes, - }, - { - "name": "fsdsinet_detected_class", - "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS, - "analyzer": FSDSINET_ANALYZER_NAME, - "original_name": "detected_classes", - "transformation": lambda v, d, s: None if v == [] else v, - }, - { - "name": "fsdsinet_detections", - "analyzer": FSDSINET_ANALYZER_NAME, - "type": AUDIO_DESCRIPTOR_TYPE_JSON, - "original_name": "detections", - "transformation": lambda v, d, s: None if v == [] else v, - "index": False, - }, - { - "name": "fsdsinet_detections_count", - "type": AUDIO_DESCRIPTOR_TYPE_INT, - "analyzer": FSDSINET_ANALYZER_NAME, - "original_name": "num_detections", - }, -] - -CONSOLIDATED_AUDIO_DESCRIPTORS_ANALYZER_NAMES = list(set([ad["analyzer"] for ad in CONSOLIDATED_AUDIO_DESCRIPTORS])) -AVAILABLE_AUDIO_DESCRIPTORS_NAMES = [desc["name"] for desc in 
CONSOLIDATED_AUDIO_DESCRIPTORS]
+# -------------------------------------------------------------------------------
+# Consolidated audio descriptors
+from .audio_descriptor_settings import *  # noqa: F403
 
 # -------------------------------------------------------------------------------
 # Search engine
From e121bc51057c126d58aa61d8b34e871faaa13e9d Mon Sep 17 00:00:00 2001
From: ffont
Date: Fri, 12 Jun 2026 18:02:28 +0200
Subject: [PATCH 5/5] Get bpm from sound description if possible

---
 freesound/audio_descriptor_settings.py |  4 +--
 sounds/models.py                       | 35 ++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/freesound/audio_descriptor_settings.py b/freesound/audio_descriptor_settings.py
index fedda0955..ae540f018 100644
--- a/freesound/audio_descriptor_settings.py
+++ b/freesound/audio_descriptor_settings.py
@@ -56,12 +56,12 @@
     {
         "name": "bpm",
         "analyzer": "fs-essentia-extractor_v1",
-        "get_func": lambda d, s: d["fs.bpm"],
+        "get_func": lambda d, s: s.estimate_bpm_from_metadata() or d["fs.bpm"],
     },
     {
         "name": "bpm_confidence",
         "analyzer": "fs-essentia-extractor_v1",
-        "get_func": lambda d, s: d["fs.bpm_confidence"],
+        "get_func": lambda d, s: 1.0 if s.estimate_bpm_from_metadata() else d["fs.bpm_confidence"],
     },
     {
         "name": "brightness",
diff --git a/sounds/models.py b/sounds/models.py
index d93db55e1..3647e092c 100644
--- a/sounds/models.py
+++ b/sounds/models.py
@@ -25,6 +25,7 @@
 import math
 import os
 import random
+import re
 import zlib
 from collections import Counter
 from urllib.parse import quote
@@ -1654,6 +1655,40 @@ def get_second_level_category_search_url(self):
         else:
             return None
 
+    def estimate_bpm_from_metadata(self, min_bpm=25, max_bpm=300):
+        """
+        Estimate the bpm of a sound by looking at its description, tags and name.
+
+        Candidates are "120bpm" / "bpm120" style sequences (with at most one whitespace
+        character in between) found in any of the fields, plus tags that are plain integers.
+        Only values within [min_bpm, max_bpm] are kept and the most frequent one is returned.
+        :param min_bpm: minimum bpm (inclusive)
+        :param max_bpm: maximum bpm (inclusive)
+        :return: estimated bpm (int) or 0 if bpm could not be estimated
+        """
+        tags = [t.lower() for t in self.get_sound_tags()]
+        # Build the searchable text only once; "or" keeps empty/None text fields from raising
+        text = " ".join([(self.description or "").lower(), (self.original_filename or "").lower()] + tags)
+
+        # Sequences like 120bpm, bpm120, 120 bpm or bpm 120 in all fields, then the raw tags
+        raw_values = re.findall(r"(\d+)\s?bpm", text) + re.findall(r"bpm\s?(\d+)", text) + tags
+
+        bpm_candidates = []
+        for raw_value in raw_values:
+            try:
+                bpm = int(raw_value)
+            except ValueError:
+                # Non-numeric tag, or a digit run int() refuses to parse
+                continue
+            if min_bpm <= bpm <= max_bpm:
+                bpm_candidates.append(bpm)
+
+        if not bpm_candidates:
+            return 0
+
+        # Return the most common candidate (on ties, the one found first)
+        return Counter(bpm_candidates).most_common(1)[0][0]
+
     class Meta:
         ordering = ("-created",)
         indexes = [