#!/usr/local/bin/perl
# converter from my diary.shtml to blosxom
# Copyright (C) 2004 Kengo Ichiki
# $Id: diary2blosxom.pl,v 1.11 2004/01/28 03:13:52 ichiki Exp $
#
# Reads one diary HTML file, walks its nested <ul>/<li> structure with
# HTML::Parser and writes one blosxom entry file (title line,
# meta-creation_date line, body) per top-level list item, plus one entry
# per "sub entry" (a list item holding an anchor whose id looks like
# dDD-name).
#
# NOTE(review): the copy of this script under review had lost its line
# breaks and everything that looked like an HTML tag: the markup literals,
# the <HANDLE> readline operators, the tail of start(), all of text() and
# the head of end().  Every piece restored by inference is marked
# "NOTE(review)" below and must be confirmed against the upstream file.

use strict;
use warnings;

use File::Compare ();
use HTML::Parser;
use Jcode;

my $topurl  = "http://kichiki.com/cgi-bin/blosxom.cgi/diary";
my $flavour = "html";

# Old blosxom entry files.  print_entry() removes every file it rewrites
# from this list; whatever is left at the end of the run is deleted.
# NOTE(review): nothing in the reviewed copy fills this list (see the
# TODO in text()), so the removal loop below is currently inert.
my @list0 = ();

# parser state shared by the start/text/end handlers
my $h1flag  = 0;    # inside an h1 tag
my $h2flag  = 0;    # inside an h2 tag
my $preflag = 0;    # inside a pre tag (only set/cleared in the code that survived)
my $nsaflag = 0;    # not-shown anchor tag flag -- local copy, no-href tags
my $ullevel = 0;    # ul nesting level

my $year  = 0;
my $month = 0;
my $day   = 0;
my $num   = 1;      # running entry number within the current day

my $title = "";     # title of the top-level entry being collected
my $body  = "";     # body of the top-level entry being collected

# stack of open sub entries; $subn is the stack depth
my $subn       = 0;
my @subfile    = ();    # entry file name (without .txt)
my @subtitle   = ();
my @subbody    = ();
my @subullevel = ();    # ul level at which the sub entry was opened

# check for command-line argument
die "Usage: diary2blosxom.pl (HTML file)\n" unless @ARGV == 1;

# get the command-line argument
my $file = shift;

# Create HTML Parser object
my $p = HTML::Parser->new(
    api_version => 3,
    start_h     => [\&start, "tagname,attr"],
    text_h      => [\&text,  "dtext"],
    end_h       => [\&end,   "tagname"],
);

die "File $file does't exist.\n" unless -e $file;
open(my $html, '<', $file) or die "Cannot open $file: $!\n";

while (my $line = <$html>) {
    # convert input-line into utf8
    Jcode::convert(\$line, "utf8");
    $p->parse($line);
}
$p->eof;    # flush whatever text the parser is still buffering
close($html);

foreach my $old (@list0) {
    print "removed: $old\n";
    unlink($old);
}

exit;


# Append $markup to the buffer that is currently collecting output.
#   no open sub entry : level 1 -> $title, deeper -> $body
#   open sub entry    : its own level -> its title, otherwise -> its body
# Nothing is appended outside of any list.  With $strict true, a level
# shallower than the sub entry's own level is dropped instead of going to
# the sub entry body (the surviving end-tag code for "a" and "pre" did this).
sub _append_markup {
    my ($markup, $strict) = @_;

    return if $ullevel < 1;

    if ($subn == 0) {
        if ($ullevel == 1) {
            $title .= $markup;
        }
        else {
            $body .= $markup;
        }
        return;
    }

    my $base = $subullevel[$subn - 1];
    if ($ullevel == $base) {
        $subtitle[$subn - 1] .= $markup;
    }
    elsif (!$strict || $ullevel > $base) {
        $subbody[$subn - 1] .= $markup;
    }
    return;
}

# Indentation used in front of nested list markup: ($depth - 1) blanks.
sub _indent {
    my ($depth) = @_;
    return $depth > 1 ? " " x ($depth - 1) : "";
}

# Rebuild an opening tag from the parser's tag name and attribute hash.
# NOTE(review): reconstructed; the original handling of other start tags
# was lost.  Attributes are kept so that e.g. image sources survive.
sub _open_tag {
    my ($tagname, $attr) = @_;

    my $tag = "<$tagname";
    foreach my $key (sort keys %$attr) {
        next if $key eq "/";    # artefact of <tag /> syntax
        my $value = defined $attr->{$key} ? $attr->{$key} : "";
        $value =~ s/&/&amp;/g;
        $value =~ s/"/&quot;/g;
        $tag .= " $key=\"$value\"";
    }
    return $tag . ">";
}

# Map a diary-local link target onto its new location.
sub _rewrite_href {
    my ($href) = @_;

    $href =~ s/^images\/AMAZON/http:\/\/images.amazon.com\/images\/P/;
    $href =~ s/^theorems.shtml/http:\/\/kichiki.hp.infoseek.co.jp\/diary\/theorems.shtml/;

    # a bare fragment refers to the diary page of the current month
    if ($href =~ /^\#(.+)/) {
        $href = sprintf("diary%4d_%02d.shtml\#%s", $year, $month, $1);
    }

    if ($href =~ /^diary(\d{4})\_(\d{2}).shtml\#d(\d+)-(.+)/) {
        # link to a sub entry
        $href = sprintf("$topurl/%4d%02d%02d-%s.$flavour", $1, $2, $3, $4);
    }
    elsif ($href =~ /^diary(\d{4})\_(\d{2}).shtml\#d(\d+)/) {
        # link to a day
        $href = "$topurl/$1/$2/$3";
    }
    elsif ($href =~ /^diary(\d{4})\_(\d{2}).shtml/) {
        # link to a month
        $href = "$topurl/$1/$2";
    }
    return $href;
}

# Normalise an entry title the way it is written to the entry file.
sub _normalize_title {
    my ($t) = @_;

    $t =~ s/\n//g;
    $t =~ s/^ +//g;
    $t =~ s/ +$//g;
    # NOTE(review): the reviewed copy read "s/ / /g", a no-op that looks
    # like a whitespace-collapsed form of "squeeze runs of blanks".
    $t =~ s/ +/ /g;
    $t =~ s/\Q()\E//g;    # parentheses left empty by a not-shown anchor
    $t = "-" if $t eq "";
    return $t;
}

# Category directory for a (normalised) title: titles ending in "[...]"
# are filed under "phys", everything else at the top level.
sub _category_for {
    my ($t) = @_;
    return $t =~ /\Q[\E.+\Q]\E$/ ? "phys" : "";
}

# <a ...> start tag: picks up the day number from h2 anchors, rewrites and
# emits links inside lists, and opens a sub entry for ids like dDD-name.
sub _start_anchor {
    my ($attr) = @_;

    my $id   = defined $attr->{id}   ? $attr->{id}   : "";
    my $href = defined $attr->{href} ? $attr->{href} : "";

    if ($ullevel == 0 && $h2flag == 1) {
        if ($id =~ /d(\d+)/) {
            $day = $1;
            $num = 1;    # reset counter
        }
    }

    return if $ullevel < 1;

    if ($href eq "") {
        # do nothing
        $nsaflag = 1;
    }
    else {
        $href = _rewrite_href($href);
        if ($href eq "" || $href =~ /^LOCAL\//) {
            # do nothing
            $nsaflag = 1;
        }
        else {
            $nsaflag = 0;    # to show (for sure)
            # NOTE(review): literal lost in the reviewed copy; reconstructed.
            _append_markup("<a href=\"$href\">", 0);
        }
    }

    # The sub entry is opened only after the link above was emitted, so
    # that link still lands in the enclosing entry.
    if ($id =~ /d(\d+)-(.+)/) {
        my ($subday, $subname) = ($1, $2);
        $subn++;
        if ($subday != $day) {
            print "WRONG!!\n";
        }
        $subfile[$subn - 1]
            = sprintf("%4d%02d%02d-%s", $year, $month, $day, $subname);
        $subullevel[$subn - 1] = $ullevel;
        $subtitle[$subn - 1]   = "";
        $subbody[$subn - 1]    = "";
    }
    return;
}

# HTML::Parser start-tag handler.
#   $tagname : lower-cased tag name
#   $attr    : hash ref of the tag's attributes
sub start {
    my ($tagname, $attr) = @_;

    # h1 tag
    if ($tagname eq "h1") {
        $h1flag = 1;
    }
    # h2 tag
    elsif ($tagname eq "h2") {
        $h2flag = 1;
    }
    # a tag
    elsif ($tagname eq "a") {
        _start_anchor($attr);
    }
    # ul tag
    elsif ($tagname eq "ul") {
        if ($ullevel >= 1) {
            if ($subn == 0) {
                # NOTE(review): "<ul>" literal reconstructed.
                $body .= "\n" . _indent($ullevel) . "<ul>\n";
            }
            else {
                # NOTE(review): this branch and the level increment below
                # were lost; reconstructed by analogy with the </li> code.
                $subbody[$subn - 1] .= "\n"
                    . _indent($ullevel - $subullevel[$subn - 1] + 1)
                    . "<ul>\n";
            }
        }
        $ullevel++;
    }
    # li tag
    # NOTE(review): lost in the reviewed copy; reconstructed as the
    # counterpart of the surviving </li> code.
    elsif ($tagname eq "li") {
        if ($subn == 0) {
            if ($ullevel > 1) {
                $body .= _indent($ullevel) . "<li>";
            }
        }
        elsif ($ullevel > $subullevel[$subn - 1]) {
            $subbody[$subn - 1]
                .= _indent($ullevel - $subullevel[$subn - 1]) . "<li>";
        }
    }
    # pre tag
    # NOTE(review): lost in the reviewed copy; reconstructed.
    elsif ($tagname eq "pre") {
        $preflag = 1;
        _append_markup("<pre>", 1);
    }
    # other tags
    # NOTE(review): lost in the reviewed copy; reconstructed.
    else {
        _append_markup(_open_tag($tagname, $attr), 0);
    }
    return;
}

# HTML::Parser text handler ($text is entity-decoded).
# NOTE(review): the whole handler was lost in the reviewed copy.  This is
# a minimal reconstruction: the heading pattern and the suppression of
# text inside not-shown anchors (inferred from the "()" clean-up in
# print_entry) both need confirming against the upstream file.
sub text {
    my ($text) = @_;

    if ($h1flag == 1) {
        # presumably the page heading carries year and month -- confirm format
        if ($text =~ /(\d{4})\D+(\d{1,2})/) {
            $year  = $1;
            $month = $2;
            # TODO(review): the lost code presumably did
            #     @list0 = getentries(".", $year, $month);
            # here.  Left disabled because it arms the file-removal loop
            # at the end of the run; restore once confirmed.
        }
        return;
    }

    return if $ullevel < 1;
    return if $nsaflag == 1;

    _append_markup($text, 0);
    return;
}

# </li>: closes either a top-level entry, a sub entry, or a plain nested
# list item, depending on the current level.
sub _end_item {
    if ($subn == 0) {
        if ($ullevel == 1) {
            my $blosxomfile
                = sprintf("%4d%02d%02d%02d", $year, $month, $day, $num);
            # end of the entry
            print_entry($blosxomfile, $title, $body,
                        $year, $month, $day, $num);
            # for sure
            $num++;
            $title = "";
            $body  = "";
        }
        elsif ($ullevel > 1) {
            $body .= "\n" . _indent($ullevel) . "</li>\n";
        }
        return;
    }

    my $cur = $subn - 1;

    if ($ullevel != $subullevel[$cur]) {
        # plain list item inside the sub entry
        # (the reviewed copy indexed with the misspelt $sun here)
        $subbody[$cur] .= "\n"
            . _indent($ullevel - $subullevel[$cur])
            . "</li>\n";
        return;
    }

    # end of the sub entry
    print_entry($subfile[$cur], $subtitle[$cur], $subbody[$cur],
                $year, $month, $day, $num);

    # make a link from the enclosing entry to the sub entry
    # categorize -- the reviewed copy tested an unset $t here, so the
    # category was always empty; use the title exactly as print_entry does
    my $category = _category_for(_normalize_title($subtitle[$cur]));

    # remove anchor tags in the title
    my $localtitle = $subtitle[$cur];
    # NOTE(review): the pattern was lost (it read "s///g"); reconstructed.
    $localtitle =~ s/<a [^>]*>//g;
    $localtitle =~ s/<\/a>//g;

    # NOTE(review): both link literals were lost; reconstructed from the
    # file names print_entry writes ("./f.txt" and "./category/f.txt").
    my $url = $category eq ""
        ? "$topurl/$subfile[$cur].$flavour"
        : "$topurl/$category/$subfile[$cur].$flavour";
    my $link = "<a href=\"$url\">$localtitle</a>";

    if ($subn == 1) {
        $body .= $link;
    }
    else {
        $subbody[$subn - 2] .= $link;
    }

    # for sure
    $num++;
    $subfile[$cur]    = "";
    $subtitle[$cur]   = "";
    $subbody[$cur]    = "";
    $subullevel[$cur] = 0;
    $subn--;
    return;
}

# HTML::Parser end-tag handler.
#   $tagname : lower-cased tag name
sub end {
    my ($tagname) = @_;

    # h1, h2 and ul tags
    # NOTE(review): these three branches were lost; reconstructed as the
    # counterparts of the matching start-tag code.
    if ($tagname eq "h1") {
        $h1flag = 0;
    }
    elsif ($tagname eq "h2") {
        $h2flag = 0;
    }
    elsif ($tagname eq "ul") {
        $ullevel-- if $ullevel > 0;
        if ($ullevel >= 1) {
            if ($subn == 0) {
                $body .= _indent($ullevel) . "</ul>";
            }
            elsif ($ullevel >= $subullevel[$subn - 1]) {
                $subbody[$subn - 1]
                    .= _indent($ullevel - $subullevel[$subn - 1] + 1)
                    . "</ul>";
            }
        }
    }
    # a tag
    elsif ($tagname eq "a") {
        if ($ullevel >= 1 && $nsaflag == 0) {
            _append_markup("</a>", 1);
        }
        else {
            $nsaflag = 0;
        }
    }
    # li tag
    elsif ($tagname eq "li") {
        _end_item();
    }
    # pre tag
    elsif ($tagname eq "pre") {
        $preflag = 0;
        # NOTE(review): literal lost in the reviewed copy; reconstructed.
        _append_markup("</pre>", 1);
    }
    # other tags
    else {
        # NOTE(review): literal lost in the reviewed copy; reconstructed.
        _append_markup("</$tagname>", 0);
    }
    return;
}

# Write one blosxom entry and report whether it is new or updated.
#   $f : entry file name without directory and ".txt"
#   $t : title
#   $b : body
#   $y : year (4 digits)
#   $m : month (1 - 12)
#   $d : day (1 - 31)
#   $n : number (1 - ); stored as the minutes of the creation date
# The entry is first written to tmp.txt in the current directory.  If the
# target file is listed in @list0 it is replaced only when the content
# differs, and is taken off the list either way.  Dies when the category
# directory or tmp.txt cannot be created or the entry cannot be moved
# into place.
sub print_entry {
    my ($f, $t, $entry_body, $y, $m, $d, $n) = @_;

    my @txtmon = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "13");

    $t = _normalize_title($t);
    $entry_body =~ s/\Q()\E//g;

    # categorize
    my $category = _category_for($t);
    my $filename;
    if ($category ne "") {
        unless (-e $category) {
            mkdir($category, 0755) or die "Cannot make dir $category\n";
        }
        $filename = "./$category/$f.txt";
    }
    else {
        $filename = "./$f.txt";
    }

    open(my $out, '>', "tmp.txt") or die "Cannot write tmp.txt: $!\n";
    print {$out} "$t\n";
    printf {$out} "meta-creation_date: %s %d, %d 00:%02d\n",
        $txtmon[int($m - 1)], $d, $y, $n;
    print {$out} "$entry_body\n";
    close($out) or die "Cannot write tmp.txt: $!\n";

    foreach my $idx (0 .. $#list0) {
        next unless $list0[$idx] eq $filename;

        if (compare_files("tmp.txt", $filename)) {
            # same
            unlink("tmp.txt");
        }
        else {
            # modified
            print("updated: $filename\n");
            rename("tmp.txt", $filename)
                or die "Cannot rename tmp.txt to $filename: $!\n";
        }
        # remove the file from @list0
        splice(@list0, $idx, 1);
        return;
    }

    # new entry
    print("new: $filename\n");
    rename("tmp.txt", $filename)
        or die "Cannot rename tmp.txt to $filename: $!\n";
    return;
}

# "getfiles" is borrowed from
# Blosxom Plugin: recentwritebacks_tree
# Author(s): typester
# Version: 1.0
# Blosxom Home/Docs/Licensing: http://www.blosxom.com/
#
# Returns the paths of all non-directory entries below $dir, descending
# into sub directories.  A directory that cannot be opened contributes
# nothing.
sub getfiles {
    my ($dir) = @_;
    my @files = ();
    my @ret   = ();

    $dir .= "/" if ($dir =~ /[^\/]$/);

    if (opendir(my $dh, $dir)) {
        @files = readdir($dh);
        closedir($dh);
    }

    foreach my $entry (@files) {
        next if ($entry eq '.' or $entry eq '..');
        if (-d "$dir$entry" and $entry ne '') {
            push(@ret, getfiles("$dir$entry"));
        }
        else {
            push(@ret, "$dir$entry");
        }
    }
    return @ret;
}

# Returns the files below $dir whose path contains the year and month
# (formatted like the entry file names) followed by more characters and
# ".txt".
sub getentries {
    my ($dir, $y, $m) = @_;

    my $str_ym = sprintf("%4d%02d", $y, $m);
    my @ret    = ();
    foreach my $path (getfiles($dir)) {
        if ($path =~ /$str_ym.+\.txt/) {
            push(@ret, $path);
        }
    }
    return @ret;
}

# return true when two files are same
# Returns 1 when both files have identical content and 0 otherwise; a
# file that cannot be read counts as different, so the caller rewrites
# the entry.  (The loop in the reviewed copy had lost its readline
# operators.)
sub compare_files {
    my ($f1, $f2) = @_;
    return File::Compare::compare($f1, $f2) == 0 ? 1 : 0;
}