# parse-unihan.pl

# mangle Unihan.txt on STDIN into tab separated columns
# for feeding it into a RDBMS
# most populated columns come first

use strict;
use diagnostics;
use Data::Dumper;

# %tags maps every known Unihan tag name to its zero-based column number
# in the generated table. The list below is ordered so that the most
# populated columns come first; its position in the list IS the column.
my %tags;
{
	my @column_order	= qw(
		kRSUnicode
		kIRGKangXi
		kRSKangXi
		kIRG_GSource
		kHanYu
		kIRGHanyuDaZidian
		kIRG_TSource
		kTotalStrokes
		kMandarin
		kIRG_KPSource
		kMorohashi
		kKangXi
		kDefinition
		kCantonese
		kCCCII
		kSBGY
		kKPS1
		kIRGDaiKanwaZiten
		kIRG_KSource
		kCangjie
		kCNS1992
		kCNS1986
		kDaeJaweon
		kIRGDaeJaweon
		kCihaiT
		kIRG_JSource
		kRSAdobe_Japan1_6
		kEACC
		kJapaneseOn
		kBigFive
		kPhonetic
		kJapaneseKun
		kIICore
		kXerox
		kIRG_VSource
		kKorean
		kTaiwanTelegraph
		kMatthews
		kVietnamese
		kGSR
		kMeyerWempe
		kMainlandTelegraph
		kGB1
		kGB0
		kJis0
		kFennIndex
		kJis1
		kNelson
		kFrequency
		kFenn
		kKSC0
		kGB3
		kHKGlyph
		kCowles
		kKPS0
		kIRG_HSource
		kHKSCS
		kTang
		kHanyuPinlu
		kJIS0213
		kLau
		kSemanticVariant
		kKSC1
		kGB5
		kSimplifiedVariant
		kTraditionalVariant
		kGradeLevel
		kZVariant
		kKarlgren
		kCompatibilityVariant
		kGB8
		kSpecializedSemanticVariant
		kIBMJapan
		kHDZRadBreak
		kRSJapanese
		kRSKanWa
		kPseudoGB1
		kGB7
		kIRG_USource
		kOtherNumeric
		kAccountingNumeric
		kRSKorean
		kPrimaryNumeric
	);

	# Hash slice: pair each tag name with its position in the list,
	# i.e. $tags{kRSUnicode} = 0, $tags{kIRGKangXi} = 1, and so on.
	@tags{@column_order}	= (0 .. $#column_order);
}

# Read Unihan records from STDIN (or from files named on the command line)
# into %unihan, keyed by the codepoint as a decimal number. Each value is
# an array whose slot N holds the content of the tag that %tags maps to
# column N.
#
# Each record line has the form:  U+XXXX <tab> tag <tab> content
# Comment lines (starting with '#') and blank lines are skipped.
# Lines with a missing tag or a codepoint not in U+XXXX form are skipped
# with a warning. Tags that are not listed in %tags are skipped as well
# (warned about once per tag); without this check the undefined column
# index would be treated as 0 and overwrite the first column.
my %unihan;
my %unknown_tag;	# tags already reported as missing from %tags
while (my $line = <>) {
	next if $line =~ /^#/;		# skip comments
	chomp $line;
	next unless $line =~ /\S/;	# skip blank lines

	my ($codepoint, $tag, $content)	= split /\t/, $line;
	unless (defined $tag && $codepoint =~ /^U\+([0-9A-Fa-f]+)$/) {
		warn "line $.: skipping malformed record: $line\n";
		next;
	}
	$codepoint	= hex $1;	# hex digits after U+, converted to dec

	my $column	= $tags{$tag};
	unless (defined $column) {
		# report every unknown tag only once to keep STDERR readable
		warn "line $.: unknown tag '$tag' ignored (not listed in %tags)\n"
			unless $unknown_tag{$tag}++;
		next;
	}

	$unihan{$codepoint}[$column]	= $content;
}

# Print one line per codepoint: the decimal codepoint followed by one
# tab-separated field for every column defined in %tags, in column order.
# Rows are emitted in ascending codepoint order so the output is
# reproducible from run to run.

# Index of the last column, derived from the tag table instead of being
# hard-coded, so it stays correct when tags are added or removed.
my $last_column	= scalar(keys %tags) - 1;

foreach my $codepoint (sort { $a <=> $b } keys %unihan) {
	my $fields	= $unihan{$codepoint};

	# Every row must have the same number of columns, else the SQL COPY
	# command throws a hissy fit. Missing fields become empty strings,
	# which also avoids "uninitialized value" warnings in the join.
	my @row	= map { defined $fields->[$_] ? $fields->[$_] : '' }
		0 .. $last_column;

	# codepoint in dec, then all content, tab separated, final newline
	my $s	= join("\t", $codepoint, @row) . "\n";

	# work around: http://google.com/search?q=0x10000+site:postgresql.org+inurl:docs
	# replace U+10000 upwards with its perl escaped \x{HEXNUM} form
	# NOTE(review): this can only match if the input was decoded into
	# characters (e.g. perl -CS or PERL_UNICODE); on raw bytes no
	# character is above 0xFF. Confirm how the script is invoked.
	$s	=~ s/([\x{10000}-\x{1FFFFF}])/'\x{'.(sprintf '%X', ord $1).'}'/ge;

	print $s;
}