#!/bin/gawk -f

# Convert slovnyk dictionaries to stardict format.
# Input file is in csv format (separated by ,) with the word in
# the third position and the translation in the fourth position.
# e.g. -  smth,smth,"word","translation"
# there are 3 notes about the conversion process
# 1 - if the same word occurs more than once in the dictionary, the
#     converter joins all translations of this word into one entry and
#     separates them by ,
# 2 - all words (not translations) will be converted to lowercase
# 3 - you should run the script in a non-UTF8 environment, except for
#     LC_COLLATE for sorting and LC_CTYPE for lowercase;
#     this is because the length function returns the number of symbols
#     and not bytes in UTF-8
# LC_ALL=C LC_COLLATE=be_BY.UTF8 LC_CTYPE=be_BY.UTF8 slovnyktodict.awk -v dictname=slovnyk_be-by_en-us slovnyk_be-by_en-us.csv
# will create slovnyk_be-by_en-us.{dict,idx,ifo} from slovnyk_be-by_en-us.csv
#
# you can send questions to alex.murugin@gmail.com

# This function (parse_csv) is in the public domain.
# For more information email LoranceStinson+csv@gmail.com.
# Or see http://lorance.freeshell.org/csv/
# Split one CSV record into fields.
#
# Parameters:
#   string  - the record text to parse.
#   csv     - output array; fields are stored at indices 0 .. count-1.
#             The array is not cleared first, so entries left over from
#             a previous, longer record remain in place.
#   sep     - field separator.
#   quote   - quote character that may enclose a field.
#   escape  - escape character; inside a quoted field an escape followed
#             by a quote or another escape is collapsed to the second
#             character.
#   newline - if true, a quoted field that reaches the end of the record
#             is continued by reading the next input record with getline
#             and joining it on with this value.
#   trim    - if true, spaces around fields are removed.
#   fields, pos, strtrim - local variables (awk idiom: extra parameters);
#             callers do not pass them.
#
# Returns the number of fields found (0 for an empty string), or a
# negative value with the global csverr set to a message:
#   -1  getline failed while reading a continuation line,
#   -2  a quoted field has no closing quote,
#   -3  a field is not followed by a separator.
function parse_csv(string,csv,sep,quote,escape,newline,trim, fields,pos,strtrim) {
    # Make sure there is something to parse.
    if (length(string) == 0) return 0;
    string = sep string; # The code below assumes ,FIELD.
    fields = 0; # The number of fields found thus far.
    # Each pass consumes one ",FIELD" from the front of string.
    while (length(string) > 0) {
        # Remove spaces after the separator if requested.
        if (trim && substr(string, 2, 1) == " ") {
            if (length(string) == 1) return fields;
            string = substr(string, 2);
            continue;
        }
        strtrim = 0; # Used to trim quotes off strings.
        # Handle a quoted field.
        if (substr(string, 2, 1) == quote) {
            # Position 1 is the separator and position 2 the opening quote;
            # the scan below starts at position 3.
            pos = 2;
            do {
                pos++
                if (pos != length(string) &&
                    substr(string, pos, 1) == escape &&
                    (substr(string, pos + 1, 1) == quote ||
                     substr(string, pos + 1, 1) == escape)) {
                    # Remove escaped quote characters.
                    string = substr(string, 1, pos - 1) substr(string, pos + 1);
                } else if (substr(string, pos, 1) == quote) {
                    # Found the end of the string.
                    strtrim = 1;
                } else if (newline && pos >= length(string)) {
                    # Handle embedded newlines if requested.
                    # Only a getline result of -1 is treated as a failure here.
                    if (getline == -1) {
                        csverr = "Unable to read the next line.";
                        return -1;
                    }
                    string = string newline $0;
                }
            } while (pos < length(string) && strtrim == 0)
            if (strtrim == 0) {
                csverr = "Missing end quote.";
                return -2;
            }
        } else {
            # Handle an empty field.
            if (length(string) == 1 || substr(string, 2, 1) == sep) {
                csv[fields] = "";
                fields++;
                if (length(string) == 1)
                    return fields;
                string = substr(string, 2);
                continue;
            }
            # Search for a separator.
            # pos is relative to the text after the leading separator, so in
            # string it indexes the last character of the field.
            pos = index(substr(string, 2), sep);
            # If there is no separator the rest of the string is a field.
            if (pos == 0) {
                csv[fields] = substr(string, 2);
                fields++;
                return fields;
            }
        }
        # Remove spaces after the separator if requested.
        if (trim && pos != length(string) && substr(string, pos + strtrim, 1) == " ") {
            # From here on the trim parameter is reused as a character offset.
            trim = strtrim
            # Count the number of spaces found.
            while (pos < length(string) && substr(string, pos + trim, 1) == " ") {
                trim++
            }
            # Remove them from the string.
            string = substr(string, 1, pos + strtrim - 1) substr(string,  pos + trim);
            # Adjust pos with the trimmed spaces if a quoted string was not found.
            if (!strtrim) {
                pos -= trim;
            }
        }
        # Make sure we are at the end of the string or there is a separator.
        if ((pos != length(string) && substr(string, pos + 1, 1) != sep)) {
            csverr = "Missing separator.";
            return -3;
        }
        # Gather the field.
        # strtrim (0 or 1) skips the opening quote and shortens the length
        # to drop both quotes.
        csv[fields] = substr(string, 2 + strtrim, pos - (1 + strtrim * 2));
        fields++;
        # Remove the field from the string for the next pass.
        string = substr(string, pos + 1);
    }
    return fields;
}
# get this function from dictgen.php (stardict-tools)
# Encode the low 32 bits of a non-negative integer as four characters,
# most significant byte first (network byte order).
#
# Parameters:
#   string - the number to encode (name kept from the original code).
#   s1..s4 - local variables (awk idiom: extra parameters); callers do
#            not pass them. Declaring them here keeps the function from
#            overwriting global variables of the same names.
#
# Returns a string built from four "%c" conversions. Each conversion is a
# single byte only when gawk runs in a byte-oriented locale (see the
# header note about LC_ALL=C).
function stardict_nbo(string,    s1, s2, s3, s4) {
	s1 = and(string, 255);
	string = rshift(string, 8);
	s2 = and(string, 255);
	string = rshift(string, 8);
	s3 = and(string, 255);
	string = rshift(string, 8);
	s4 = and(string, 255);
	return sprintf("%c%c%c%c", s4, s3, s2, s1);
}
# Startup checks, run before any input is read.
# Requires the dictname variable (pass it with -v dictname=...) and
# refuses to run if any of the three output files already exists.
# On failure the global flag ex is set to 1 so that the END rule, which
# awk still runs after an exit in BEGIN, can bail out immediately.
BEGIN {
	# All output is written verbatim; newlines are added explicitly.
	ORS = "";
	ex = 0;
	if (dictname == "") {
		print "Please specify dictionary name\n";
		ex = 1;
		exit 1;
	}
	# Single-quote the name for the shell so spaces and metacharacters in
	# dictname are taken literally; embedded quotes become '\''.
	quoted = dictname;
	gsub(/'/, "'\\''", quoted);
	# Use -e: the unary -a test is not defined by POSIX, and a shell that
	# rejects it would make this check pass by mistake.
	check = "[ -e '" quoted ".ifo' ] || [ -e '" quoted ".idx' ] || [ -e '" quoted ".dict' ]";
	if (system(check) == 0) {
		print "Some of destination files already exist\n";
		ex = 1;
		exit 1;
	}
}
# Per-record rule: parse the line as CSV and, when it has exactly four
# fields, file the fourth field (translation) under the lowercased third
# field (word). Repeated words accumulate their translations, joined
# with ", ". Lines with any other field count are ignored.
{
	nfields = parse_csv($0, csv, ",", "\"", "\"", "\\n", 1);
	if (nfields == 4) {
		key = tolower(csv[2]);
		csv[2] = key;
		# Prefix with the existing entry and a separator only when the
		# word already has a non-empty translation.
		dict[key] = (dict[key] ? dict[key] ", " : "") csv[3];
	}
}
# Final rule: write the collected dictionary as three files,
# dictname.idx, dictname.dict and dictname.ifo.
# Each .idx entry is the word, a NUL character, then the offset and the
# length of its translation in .dict, both encoded by stardict_nbo.
# Offsets and lengths come from length(), so they are byte counts only
# in a byte-oriented locale (see the header notes).
END {
	# BEGIN failed its checks; awk still runs END after that exit.
	if (ex == 1) { exit 1; }
	# orig[1..cnt] receives the words (indices of dict) in sorted order.
	cnt = asorti(dict, orig);
	if (cnt == 0) {
		print "Empty dictionary\n";
		exit 1;
	}
	# Build the file names once; a plain variable as redirection target
	# avoids the ambiguous "> name suffix" concatenation.
	idxfile = dictname ".idx";
	dictfile = dictname ".dict";
	ifofile = dictname ".ifo";
	pos = 0;
	for (z = 1; z <= cnt; ++z) {
		len = length(dict[orig[z]]);
		print orig[z] sprintf("%c", 0) stardict_nbo(pos) stardict_nbo(len) > idxfile;
		print dict[orig[z]] > dictfile;
		pos = pos + len;
	}
	# Flush the index to disk before asking for its size.
	close(idxfile);
	close(dictfile);
	# Single-quote the name for the shell; embedded quotes become '\''.
	quoted = dictname;
	gsub(/'/, "'\\''", quoted);
	com = "stat --printf=\"%s\" -- '" quoted ".idx'";
	# getline returns a value <= 0 when the command produced no output
	# or failed; do not write an .ifo with an empty size in that case.
	if ((com | getline size) <= 0) {
		close(com);
		print "Unable to determine size of " idxfile "\n";
		exit 1;
	}
	close(com);
	print "StarDict's dict ifo file\nversion=2.4.2\nwordcount=" cnt "\nidxfilesize=" size "\nbookname=" dictname "\nsametypesequence=m\n" > ifofile;
	close(ifofile);
}