Yeah.. I have all these music files as mp3 format.. with varying id3 tags, some are similar such as 'Nine Inch Nails' and 'Nine inch Nails', so, I am working on consolidating them..
So far I have some scripts that use String::Similarity::Group to find close matches.. It's working great.. I print sh commands to stdout for review.. Thing is.. I don't know whether the artist names, album names, etc. are authoritative or not.. So I'm trying to get a list of what *would* be authoritative.
It seems freedb should offer RPC calls, the way WordPress does (see WordPress::XMLRPC).
Here's some scripts if you care to look.. I'm sure you realize these are hacks..
id3normalizer.pl
#!/usr/bin/perl
# id3normalizer.pl
#
# Scans one or more directories for mp3 files, caches their id3 tag values
# on disk, and answers questions about them: per-tag statistics, files by
# tag value, unique tag values, a similarity report, and suggested id3tag
# commands for files that are missing a tag.
use strict;
use warnings;

# Package globals shared by every sub in this script.
our ( $VERSION, @IN, %_IN, @FILES, $ABS_CACHE, $BASE, @TAG_NAMES, $DATA );

# The second import argument is the option spec; the $opt_* variables used
# below are presumably exported by LEOCHARRE::CLI2 from that spec - confirm.
use LEOCHARRE::CLI2 ':all', 'qsrt:a:Sz:x';
use LEOCHARRE::Dir ':all';
use Cache::File;
use MP3::Tag;

#use Smart::Comments '###';
$VERSION = sprintf "%d.%02d", q$Revision: 1.3 $ =~ /(\d+)/g;

@TAG_NAMES = qw/artist album track title/;    # the tags we care about

# Emit a message on STDERR unless -q was given.
# Always returns 1 so callers can chain it with 'and'.
sub say {
    $opt_q and return 1;
    warn "@_\n";
    1;
}

# Runs at compile time: make sure the working directory exists, and register
# the default music directory before any argument handling happens.
BEGIN {
    $BASE = "$ENV{HOME}/.id3normalizer";
    -d $BASE or mkdir $BASE or die("Cant make '$BASE', $!");
    $ABS_CACHE = "$BASE/cache";
    $_IN{"$ENV{HOME}/music"}++;

    ### base : $BASE
    ### cache : $ABS_CACHE
}

my $cache = Cache::File->new( cache_root => $ABS_CACHE );
debug("Cache initialized");

# @IN is where to look for files
if ( argv_dirs_count() ) {
    my @dirs = argv_dirs();
    debug("adding @dirs");
    $_IN{$_}++ for @dirs;
}

# %_IN is only used to de-duplicate the directory list.
@IN = keys %_IN;
debug("in: @IN");

debug("getting list of mp3s..");
@FILES = abs_mp3s();
my $count = scalar @FILES;
say("Got $count mp3 files.\n");

$opt_r and DATA_reload() and exit;

# load DATA the main data by tags/files
DATA_load();

# Each of these returns immediately unless its option was given; the one
# that does run prints its output and exits the program.
stats();
find_by_tag_value();
show_file_info();
show_all_unique_tag();    # like all artists, or whatever
run_similarity_report();
suggest_for_missing_tags();

# Only reached when none of the actions above applied.
die("No args\n");

# by using simplest cache..
#for my $tag_name (@TAG_NAMES){
#   my $data = _files_by_tag($tag_name);
#   $DATA{$tag_name} = $data;
#}

# Print the tag info of every mp3 file named on the command line, then exit.
# Does nothing when no file arguments were given.
sub show_file_info {
    argv_files_count() or return;

    # Require a real ".mp3" extension, not just a name ending in "mp3".
    my @files = grep { /\.mp3\z/i } argv_files();
    debug();
    for my $abs (@files) {
        _show_tags_for_file($abs);
    }
    exit;
}

# Print every distinct value of the tag named by -t, then exit.
# Skipped when -a is also given (that combination is a value lookup).
sub show_all_unique_tag {
    $opt_t or return;
    $opt_a and return;    # not if we have a att val

    my @uniques = _unique_tag_values($opt_t);
    my $count   = scalar @uniques;
    print "$_\n" for @uniques;
    printf "\ntotal: %s\n", $count;
    exit;
}

#, ok.. how many of each?
# Print cache location, search paths, and per-tag counts (-s), then exit.
sub stats {
    $opt_s or return;

    print "cache: $ABS_CACHE\n";
    print "path: $_\n" for @IN;
    printf "total mp3s: %s\n\n", ( scalar @FILES );

    for my $tag_name ( sort keys %$DATA ) {
        my @uniques = _unique_tag_values($tag_name);
        my $count   = scalar @uniques;

        # Scalar context yields an array ref; guard against an undef result.
        my $unknowns = _files_by_tag_value( $tag_name, 'unknown' ) || [];
        my $ucount   = scalar @$unknowns;

        printf "Unique %s: %s\nFiles missing tag value: %s\n\n",
          $tag_name, $count, $ucount;
    }
    exit;
}

# Print the tag name/value pairs of a single file.
#   $abs - path of the mp3 file, must be an absolute path
# Prints nothing when no tag info can be obtained for the file.
sub _show_tags_for_file {
    my $abs = shift;    # must be abs path

    # 'return', not 'next': there is no loop in this sub, and 'next' would
    # reach into the caller's loop instead.
    my $info = _id3_info($abs) or return;

    print "$abs\n";
    for my $tag ( sort keys %$info ) {
        printf "%s: '%s'\n", $tag, $info->{$tag};
    }
    print "\n";
}

# Escape the characters that stay special inside a double-quoted sh word
# (double quote, backslash, dollar, backtick) so a value can be embedded in
# "..." in a generated command.
sub _sh_dq_escape {
    my $text = shift;
    $text = '' unless defined $text;
    $text =~ s/(["\\\$`])/\\$1/g;
    return $text;
}

# Guess a value for a missing tag from the directory layout under the search
# directories in @IN:
#   artist - first directory level below the search directory
#   album  - second directory level; a leading "Artist Name - " is dropped
# Returns the prettified guess, or '' when no search directory yields one.
# When several search directories match, the last one wins.
sub _suggestion_from_path {
    my ( $tag_name, $abs ) = @_;
    my $suggestion = '';

    # if inside abs music, what's the next dir..
    # such as ~/music/element/..
    # what does element hold... ??
    # and prettify that.
  BASEDIR: for my $basedir (@IN) {

        # \Q..\E: the directory is a literal path, not a pattern.
        $abs =~ /^\Q$basedir\E/ or next BASEDIR;

        # can we get a subdir...
        my $subdir;
        if ( $tag_name eq 'artist' ) {
            ($subdir) = $abs =~ m{^\Q$basedir\E/([^/]+)/};
        }
        else {
            ($subdir) = $abs =~ m{^\Q$basedir\E/[^/]+/([^/]+)/};
        }
        unless ( defined $subdir ) {
            warn("cant get subdir for $abs");
            next BASEDIR;
        }

        # sometimes the album subdirs are named like..
        # Artist Name - album name
        # if so... keep only the part after the dash
        if ( $tag_name eq 'album' and $subdir =~ m/[a-zA-Z0-9 ]+[\-](.+)/ ) {
            $subdir = $1;
        }

        $suggestion = String::Prettify::prettify($subdir);
    }
    return $suggestion;
}

# For every file whose artist or album tag is 'unknown', print an id3tag
# command that would set it to a value guessed from the file's path (-x),
# then exit. Output is sh commands on stdout, meant to be reviewed before
# being run.
sub suggest_for_missing_tags {    # as sh commands
    $opt_x or return;
    require String::Prettify;

  TAG: for my $tag_name ( sort keys %$DATA ) {

        # skip track and title tags..
        next TAG if $tag_name eq 'title' or $tag_name eq 'track';

        my @unknowns = @{ _files_by_tag_value( $tag_name, 'unknown' ) || [] };
        my $ucount   = scalar @unknowns;
        $ucount or next TAG;

        printf "# Tag: %s\n# files missing tag: %s\n# suggestions:\n",
          $tag_name, $ucount;

        for my $abs (@unknowns) {
            my $suggestion = _suggestion_from_path( $tag_name, $abs );
            printf qq{id3tag --%s="%s" "%s"\n},
              $tag_name, _sh_dq_escape($suggestion), _sh_dq_escape($abs);
        }
        print "\n\n";
    }
    exit;
}

# Print every file whose tag -t has the value -a, then exit.
# Does nothing unless both options were given.
sub find_by_tag_value {
    $opt_t and $opt_a or return;
    debug("Looking for files tag '$opt_t' value '$opt_a'\n");

    my @files = @{ _files_by_tag_value( $opt_t, $opt_a ) || [] };
    my $count = scalar @files;
    debug("Got $count");

    print "$_\n" for @files;
    print "\n";
    exit;
}

# Group similar artist and album values (-S) and dump the groups as YAML on
# stdout, then exit. -z sets the similarity threshold (default 0.7).
# Each group records the tag name, the similar values, the files and file
# count per value, and the values ordered by descending file count.
sub run_similarity_report {
    $opt_S or return;
    $opt_z ||= '0.7';

    my @report;
    require String::Similarity::Group;
    require YAML;

  TAG: for my $tag_name (qw/artist album/) {    #should do title also?? not as important ?
        debug("doing $tag_name");

        my @uniques = _unique_tag_values($tag_name);
        my $count   = scalar @uniques;
        debug("unique count: $count, grouping..");

        my @groups = String::Similarity::Group::groups( $opt_z, \@uniques );
        my $gcount = scalar @groups;
        debug("count of groups: $gcount");

        # ok, we need to have the list of files..
      GROUP: for my $group (@groups) {
            my $groupdata = {
                tagname   => $tag_name,
                tagvalues => $group,
            };

          VALUE: for my $similar_tag_value (@$group) {
                my @files =
                  @{ _files_by_tag_value( $tag_name, $similar_tag_value ) || [] };
                $groupdata->{tagvalue}->{$similar_tag_value}->{files} = \@files;
                $groupdata->{tagvalue}->{$similar_tag_value}->{count} =
                  scalar @files;
            }

            # which val has highest count ?
            # Ties are broken by name so the report is reproducible.
            my @order = sort {
                $groupdata->{tagvalue}->{$b}->{count}
                  <=> $groupdata->{tagvalue}->{$a}->{count}
                  or $a cmp $b
            } @$group;

            $groupdata->{tagvalues_by_order_of_occurrence} = \@order;
            $groupdata->{tagvalue_with_highest_occurrence} = $order[0];

            push @report, $groupdata;
        }
    }

    my $out = YAML::Dump(@report);
    print $out;
    exit;
}

# unused..
# do via: _files_by_tag_value( $tag_name, 'unknown' );

exit;

# get unique values of a tag..
# for example 'artist', what are the unique artist names found?
# etc

# Return the sorted distinct values recorded for a tag.
#   $tag_name - one of the tag names held in $DATA; dies when empty
# Returns a list in list context, an array ref in scalar context.
sub _unique_tag_values {
    my $tag_name = shift;
    $tag_name or die;

    # if artist, for example, how many?
    my @all = ( sort keys %{ $DATA->{$tag_name} } );
    wantarray ? (@all) : \@all;
}

# if we want all the files that have artist= u2
# ALSO... if the tag value is 'unknown' !!! :-)
#
#   $tag_name - tag to look in
#   $value    - tag value to look for; dies when undefined or empty
# Returns the matching files as a list in list context, as an array ref in
# scalar context. The result is empty (never undef) when nothing matches.
sub _files_by_tag_value {
    my ( $tag_name, $value ) = @_;
    defined $value and length $value or die;

    # Look the tag up first so an unknown tag name is not autovivified
    # into $DATA.
    my $for_tag = $DATA->{$tag_name};
    my $data    = $for_tag ? $for_tag->{$value} : undef;

    unless ($data) {
        debug("Had nothing for '$tag_name', value '$value'");
        $data = [];
    }
    wantarray ? (@$data) : $data;
}

=pod

$DATA = {

   artist => {
      u2 => [ files ],
      unknown => [ files ],
   },

   track => {
      1 => [ files ],
      2 => [ files ],
   },

   title => {
      'fire' => [ files ],
      unknown => [ files ],
   },

   album => {
      'self titled' => [ files ],
      unknown => [ files ],
   },

}

=cut

# this analyzes all the mp3s on disk and stores
# by tags, which files are that..
# so that you can ask.. what are the artists present,
# which files have that artist
# which files do not have an artist track.. etc.
#
# Loads $DATA from the cache, or rebuilds it from @FILES and stores it in
# the cache when no cached copy exists. Returns $DATA.
sub DATA_load {
    debug();
    unless ( $DATA = $cache->thaw('DATA') ) {
        say("Reloading DATA..\n");
      TAG: for my $tag_name (@TAG_NAMES) {
            $DATA->{$tag_name} = _files_by_tag($tag_name);
        }
        $cache->freeze( DATA => $DATA, 'never' );    # every day??
    }
    $DATA;
}

# Drop the cached data so the next DATA_load() rebuilds it. Returns 1.
# NOTE(review): in the Cache API clear() appears to empty the whole cache
# (remove($key) is the single-key call), so the per-file tag info would be
# dropped too and the 'DATA' argument ignored - confirm this is intended.
sub DATA_reset {
    $cache->clear('DATA');
    1;
}

# Throw away the cached data and rebuild it (-r).
sub DATA_reload {
    say("Reloading DATA, scanning dirs for files, etc..\nThis may take a while..\n\n");
    DATA_reset();
    DATA_load();
    say("Done.\n");
}

# make hash, each key is tag info, each val is an array of files
# so.. like..
# {
#   $artist_name => [ files ]
#   $artist_name_2 => [ files ]
#
#   $tag_name - tag to index by; dies when empty
# Files for which no tag info can be obtained are left out.
sub _files_by_tag {
    my $tag_name = shift;
    $tag_name or die;
    debug("loading $tag_name..");

    my $data = {};
    for my $abs (@FILES) {    ### Working===[%]

        # by storing path/tag individual values..
        # $tag_value = _abs_to_tag( $abs, $tag_name );# WAY SLOW
        # by storing hashref of info..
        my $info = _id3_info($abs) or next;
        my $tag_value = $info->{$tag_name};

        #$tag_value ||= "$tag_name\_unknown";
        push @{ $data->{$tag_value} }, $abs;
    }
    debug("done loading $tag_name");
    $data;
}

# Return a hash ref holding one value per name in @TAG_NAMES for a file,
# with 'unknown' standing in for any missing or false value. The result is
# cached per path and never expires.
#   $abs - path of the mp3 file; dies when empty
# Returns nothing when an MP3::Tag object cannot be created for the file.
sub _id3_info {
    my $abs = shift;
    $abs or die;

    my $info;
    unless ( $info = $cache->thaw("_id3_info_$abs") ) {
        debug("loading MP3::Tag...");
        my $tag   = mp3_tag($abs) or return;
        my $_info = $tag->autoinfo;
        unless ($_info) {
            warn("Cant get autoinfo for $abs");
            $_info = {};
        }
        for my $tag_name (@TAG_NAMES) {
            $info->{$tag_name} = ( $_info->{$tag_name} || 'unknown' );
        }
        $cache->freeze( "_id3_info_$abs", $info, 'never' );
    }
    $info;
}

# Return the paths of all regular files named *.mp3 (any case) found below
# the directories in @IN.
# The tree is walked in-process instead of interpolating @IN into a shell
# command, so directory names containing spaces or shell metacharacters work.
sub abs_mp3s {
    require File::Find;

    my @roots;
    for my $dir (@IN) {
        if ( -d $dir ) {
            push @roots, $dir;
        }
        else {
            warn("cant search '$dir', not a directory\n");
        }
    }
    @roots or return;

    my @found;
    File::Find::find(
        {
            no_chdir => 1,    # keep $File::Find::name usable as-is
            wanted   => sub {
                my $path = $File::Find::name;
                $path =~ /\.mp3\z/i or return;

                # regular files only; symlinks are skipped, like find -type f
                return if -l $path;
                -f $path or return;
                push @found, $path;
            },
        },
        @roots
    );
    return @found;
}

# Return an MP3::Tag object for a file, or nothing (after a warning) when
# one cannot be created.
#   $abs - path of the mp3 file; dies when empty
sub mp3_tag {
    my $abs = shift;
    $abs or die;

    my $tag;

    #unless( $tag = $cache->thaw( "mp3_tag_$abs" ) ){
    $tag = MP3::Tag->new($abs)
      or warn("cant instance MP3::Tag for '$abs'")
      and return;

    #$cache->freeze( "mp3_tag_$abs", $tag, '10 minutes' );
    #}
    $tag;
}

=pod

# NOPE _ THIS IS SO SLOW
# ANOTHER WAY... would it be faster to store abs => tag data?
# usage:
#   my $artist = _abs_to_tag( './file.mp3', 'artist' );

sub _abs_to_tag {
   my($abs,$tag) = @_;
   $tag or die;
   my $value;
   unless( $value = $cache->get( "$tag\_$abs" ) ){
      my $tag = mp3_tag( $abs ) or return;
      my $info = $tag->autoinfo or return;
      $value = $info->{$tag};
      $cache->set("$tag\_$abs", $value, 'never');
   }
   $value;
}

=cut

exit;

# Return the help text for this script.
sub usage {
    qq{$0 [OPTION].. [DIR]..

   -d          debug
   -h          help
   -v          version
   -q          quiet
   -r          reload/update
   -s          stats
   -t string   tag name
   -a string   value
   -S          run similarity test for main tags, print as YAML
   -z float    similarity threshold
   -x          suggest for missing tags, shows sh commands to stdout

Any DIR arguments will be interpreted as the directory to look up mp3s in.
By default we do look inside ~/music

To clear cache, delete $ABS_CACHE dir.

If you want to find all files with artist=unknown
   $0 -t artist -a unknown

If you want to see all artists
   $0 -t artist

If you want to see the id3 tag info for files..
   $0 ./file.mp3

If you want to see which artists or albums etc are similar:
   $0 -t artist | xargs --delimiter=\\\\n gbs

}
}

id3normalizer_parse_report.pl :

#!/usr/bin/perl
# id3normalizer_parse_report.pl
#
# Reads the YAML report written by "id3normalizer.pl -S" and prints, for
# each group of similar tag values, id3tag commands that would rewrite the
# less frequent values to the most frequent one.
use strict;
use warnings;
use vars qw($VERSION);
use LEOCHARRE::CLI2 ':all', 'm:s';
use LEOCHARRE::Dir ':all';
use YAML;
$VERSION = sprintf "%d.%02d", q$Revision: 1.3 $ =~ /(\d+)/g;

my $abs_report = $ARGV[0];
$abs_report and -f $abs_report or die("missing report");

$opt_m ||= 2;
debug("mininum diff count of occurrence to deem important: $opt_m");

my @issues = YAML::LoadFile($abs_report);

ISSUE: for my $issue (@issues) {
    my $tagname = $issue->{tagname};

    # skip title
    $tagname eq 'title' and next ISSUE;

    my @similar_values = @{ $issue->{tagvalues} };
    my $chosen_value   = $issue->{tagvalue_with_highest_occurrence};

    my @occurrence_counts =
      sort { $b <=> $a }
      map  { $issue->{tagvalue}->{$_}->{count} } @similar_values;

    # judge occurrence counts..
    # A group with fewer than two values has nothing to compare against;
    # treat the missing counts as 0.
    my ( $highest, $runner_up ) = @occurrence_counts;
    $highest   = 0 unless defined $highest;
    $runner_up = 0 unless defined $runner_up;
    my $judged_important = ( $highest - $runner_up >= $opt_m ) ? 1 : 0;

    # -s skips only the groups that are not important; without -s every
    # group is printed, the unimportant ones marked "(ignore)".
    next ISSUE if $opt_s and not $judged_important;

    debug("important $judged_important");

    printf "# %s\n", ( $judged_important ? 'IMPORTANT' : '(ignore)' );
    printf "# tag: %s\n", $tagname;
    print "# similar values:";
    ( printf " '%s'", $_ ) for @similar_values;
    print "\n";
    print "# occurrence counts: @occurrence_counts\n";
    print "# highest occurring: $chosen_value\n";

    if ($judged_important) {
        print "# suggested: \n";
        my $escaped_chosen_value = _escape_for_sh($chosen_value);

      SUGGEST: for my $wrong_value (@similar_values) {
            $wrong_value eq $chosen_value and next SUGGEST;   # skip if this is the same as highest

            my @files = @{ $issue->{tagvalue}->{$wrong_value}->{files} };
            for my $abs (@files) {
                printf qq{id3tag --%s="%s" "%s"\n},
                  $tagname, $escaped_chosen_value, _escape_for_sh($abs);
            }
        }
    }

    print "\n\n";
}

exit;

# Escape the characters that stay special inside a double-quoted sh word
# (double quote, backslash, dollar, backtick) so a value can be embedded in
# "..." in a generated command.
sub _escape_for_sh {
    my $text = shift;
    $text = '' unless defined $text;
    $text =~ s/(["\\\$`])/\\$1/g;
    return $text;
}

# Return the help text for this script.
sub usage {
    qq{$0 [OPTION].. FILE

   -d          debug
   -h          help
   -v          version
   -m number   minimum occurrence count diff between highest and next.. to deem important
   -s          skip non important, where -m doesnt happen true

Usage:

   $0 ./report.yml

prints sh commands suggested to stdout

To generate report..

   id3normalizer.pl -S > ./report.yml
   id3normalizer_parse_report.pl ./report.yml > suggested_commands.sh

}
}

__END__

=pod

=head1 NAME

id3normalizer_parse_report.pl - suggest id3tag commands from an id3normalizer.pl similarity report

=head1 DESCRIPTION

=head1 OPTIONS

=head1 USAGE

=head2 Usage Examples

=head1 AUTHOR

Leo Charre leocharre at cpan dot org

=head1 SEE ALSO

=head1 LICENSE

This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself, i.e., under the terms of the "Artistic License" or the "GNU General Public License".

=head1 DISCLAIMER

This package is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the "GNU General Public License" for more details.

=cut
So.. using these.. I would..
perl id3normalizer.pl -r; # update
perl id3normalizer.pl -S > report.yml ; # save some data about what we see
perl id3normalizer_parse_report.pl -m 1 -s ./report.yml > suggested_changes.sh
In reply to Re^2: any suggestions for freedb search queries?
by leocharre
in thread any suggestions for freedb search queries?
by leocharre
| For: | Use: | ||
| & | &amp; | ||
| < | &lt; | ||
| > | &gt; | ||
| [ | &#91; | ||
| ] | &#93; |