#!/bin/gawk -f
#
# Convert slovnyk dictionaries to StarDict format.
#
# The input file is in CSV format (fields separated by ",") with the word in
# the third position and its translation in the fourth position, e.g.:
#     smth,smth,"word","translation"
#
# There are 3 notes about the conversion process:
#  1 - If the same word occurs more than once in the dictionary, the
#      converter merges all translations of that word into one entry,
#      separated by ", ".
#  2 - All words (not translations) are converted to lowercase.
#  3 - You should run the script in a non-UTF-8 environment, except for
#      LC_COLLATE (used for sorting) and LC_CTYPE (used for lowercasing).
#      This is because length() returns the number of characters, not
#      bytes, in a UTF-8 locale, while StarDict offsets are byte counts.
#
# Example:
#   LC_ALL=C LC_COLLATE=be_BY.UTF8 LC_CTYPE=be_BY.UTF8 slovnyktodict.awk -v dictname=slovnyk_be-by_en-us slovnyk_be-by_en-us.csv
# will create slovnyk_be-by_en-us.{dict,idx,ifo} from slovnyk_be-by_en-us.csv
#
# NOTE(review): LC_ALL normally takes precedence over the individual LC_*
# variables, so the LC_COLLATE/LC_CTYPE settings in the example above may
# have no effect -- confirm against your environment.
#
# You can send questions to alex.murugin@gmail.com

# This function (parse_csv) is in the public domain.
# For more information email LoranceStinson+csv@gmail.com.
# Or see http://lorance.freeshell.org/csv/
#
# Split one CSV record into fields.
#
# Parameters:
#   string  - the record text to parse
#   csv     - output array; fields are stored at indices 0 .. (count - 1)
#   sep     - field separator character
#   quote   - quote character that delimits quoted fields
#   escape  - character that escapes a quote/escape inside a quoted field
#   newline - if non-empty, a quoted field may continue on the next input
#             line; this text is inserted between the joined lines
#   trim    - if true, spaces next to separators are removed
#   fields, pos, strtrim - local variables, do not pass
#
# Returns the number of fields found (0 for an empty string), or a negative
# value on error with the global csverr set to a message:
#   -1  the next input line could not be read for an embedded newline
#   -2  a quoted field has no closing quote
#   -3  a quoted field is not followed by a separator
function parse_csv(string,csv,sep,quote,escape,newline,trim, fields,pos,strtrim) {
    # Make sure there is something to parse.
    if (length(string) == 0) return 0;

    string = sep string; # The code below assumes ,FIELD.
    fields = 0; # The number of fields found thus far.
    while (length(string) > 0) {
        # Remove spaces after the separator if requested.
        if (trim && substr(string, 2, 1) == " ") {
            if (length(string) == 1) return fields;
            string = substr(string, 2);
            continue;
        }

        strtrim = 0; # Used to trim quotes off strings.

        # Handle a quoted field.
        if (substr(string, 2, 1) == quote) {
            pos = 2;
            do {
                pos++;
                if (pos != length(string) &&
                    substr(string, pos, 1) == escape &&
                    (substr(string, pos + 1, 1) == quote ||
                     substr(string, pos + 1, 1) == escape)) {
                    # Remove escaped quote characters.
                    string = substr(string, 1, pos - 1) substr(string, pos + 1);
                } else if (substr(string, pos, 1) == quote) {
                    # Found the end of the string.
                    strtrim = 1;
                } else if (newline && pos >= length(string)) {
                    # Handle embedded newlines if requested.
                    # getline returns 0 at end of input and leaves $0
                    # untouched; treating that as success would re-append
                    # the same line forever, so any non-positive result is
                    # an error.
                    if ((getline) <= 0) {
                        csverr = "Unable to read the next line.";
                        return -1;
                    }
                    string = string newline $0;
                }
            } while (pos < length(string) && strtrim == 0)

            if (strtrim == 0) {
                csverr = "Missing end quote.";
                return -2;
            }
        } else {
            # Handle an empty field.
            if (length(string) == 1 || substr(string, 2, 1) == sep) {
                csv[fields] = "";
                fields++;
                if (length(string) == 1) return fields;
                string = substr(string, 2);
                continue;
            }

            # Search for a separator.
            pos = index(substr(string, 2), sep);

            # If there is no separator the rest of the string is a field.
            if (pos == 0) {
                csv[fields] = substr(string, 2);
                fields++;
                return fields;
            }
        }

        # Remove spaces after the separator if requested.
        if (trim && pos != length(string) && substr(string, pos + strtrim, 1) == " ") {
            # From here on "trim" doubles as the space counter; it always
            # ends up non-zero, so it still reads as true on later passes.
            trim = strtrim;
            # Count the number of spaces found.
            while (pos < length(string) && substr(string, pos + trim, 1) == " ") {
                trim++;
            }
            # Remove them from the string.
            string = substr(string, 1, pos + strtrim - 1) substr(string, pos + trim);
            # Adjust pos with the trimmed spaces if a quoted string was not found.
            if (!strtrim) {
                pos -= trim;
            }
        }

        # Make sure we are at the end of the string or there is a separator.
        if ((pos != length(string) && substr(string, pos + 1, 1) != sep)) {
            csverr = "Missing separator.";
            return -3;
        }

        # Gather the field.
        csv[fields] = substr(string, 2 + strtrim, pos - (1 + strtrim * 2));
        fields++;

        # Remove the field from the string for the next pass.
        string = substr(string, pos + 1);
    }

    return fields;
}

# Encode a number as 4 bytes, most significant byte first (network byte
# order), as used for offsets and sizes in the StarDict .idx file.
# Adapted from dictgen.php (stardict-tools).
#
# Parameters:
#   string - the non-negative integer to encode (only the low 32 bits are used)
#   b1, b2, b3, b4 - local variables, do not pass
#
# Returns a 4-character string holding the encoded bytes.
# NOTE(review): "%c" is expected to emit exactly one byte per value only in
# a single-byte (e.g. C) locale -- see note 3 in the file header.
function stardict_nbo(string,    b1, b2, b3, b4) {
    b1 = and(string, 255);
    string = rshift(string, 8);
    b2 = and(string, 255);
    string = rshift(string, 8);
    b3 = and(string, 255);
    string = rshift(string, 8);
    b4 = and(string, 255);
    return sprintf("%c", b4) sprintf("%c", b3) sprintf("%c", b2) sprintf("%c", b1);
}

# Wrap a string in single quotes so the shell treats it as one literal word.
# Embedded single quotes are rewritten as '\'' (close quote, escaped quote,
# reopen quote).
function shell_quote(s) {
    gsub(/'/, "'\\\\''", s);
    return "'" s "'";
}

# Validate the command line before any input is read.
BEGIN {
    # Records are written without a trailing newline; the output files are
    # binary/packed, and messages carry their own "\n".
    ORS = "";
    # "exit" inside BEGIN still runs the END rule, so remember that we
    # failed and let END bail out immediately.
    ex = 0;

    if (dictname == "") {
        print "Please specify dictionary name\n" > "/dev/stderr";
        ex = 1;
        exit 1;
    }

    # Refuse to overwrite existing output. "-e" is the POSIX existence test;
    # file names are quoted so spaces or shell metacharacters in dictname
    # cannot break (or inject into) the command.
    if (system("[ -e " shell_quote(dictname ".ifo") " ] || [ -e " shell_quote(dictname ".idx") " ] || [ -e " shell_quote(dictname ".dict") " ]") == 0) {
        print "Some of destination files already exist\n" > "/dev/stderr";
        ex = 1;
        exit 1;
    }
}

# Collect translations: field 3 is the word, field 4 its translation.
# Records that do not parse into exactly 4 fields are skipped.
{
    if (parse_csv($0, csv, ",", "\"", "\"", "\\n", 1) == 4) {
        csv[2] = tolower(csv[2]);
        # Join several translations of the same word with ", ".
        if (dict[csv[2]])
            dict[csv[2]] = dict[csv[2]] ", ";
        dict[csv[2]] = dict[csv[2]] csv[3];
    }
}

# Write the .idx, .dict and .ifo files from the collected dictionary.
END {
    if (ex == 1) {
        exit 1;
    }

    # orig[1..cnt] receives the words in sorted order.
    cnt = asorti(dict, orig);
    if (cnt == 0) {
        print "Empty dictionary\n" > "/dev/stderr";
        exit 1;
    }

    idxfile  = dictname ".idx";
    dictfile = dictname ".dict";
    ifofile  = dictname ".ifo";

    # Each index entry is: word, NUL, offset into .dict, length of the data.
    pos = 0;
    for (z = 1; z <= cnt; ++z) {
        len = length(dict[orig[z]]);
        print orig[z] sprintf("%c", 0) stardict_nbo(pos) stardict_nbo(len) > idxfile;
        print dict[orig[z]] > dictfile;
        pos = pos + len;
    }
    # Flush both files so the size reported below is final.
    close(idxfile);
    close(dictfile);

    # The .ifo file must state the size of the index file in bytes.
    com = "stat --printf=\"%s\" " shell_quote(idxfile);
    if ((com | getline size) <= 0) {
        close(com);
        print "Unable to determine size of " idxfile "\n" > "/dev/stderr";
        exit 1;
    }
    close(com);

    print "StarDict's dict ifo file\nversion=2.4.2\nwordcount=" cnt "\nidxfilesize=" size "\nbookname=" dictname "\nsametypesequence=m\n" > ifofile;
    close(ifofile);
}