#!/usr/bin/perl

################################################################################
#                               Word Extractor                                 #
################################################################################
# Version : 0.1.1                                                              #
# License : GPL (General Public License)                                       #
# Author  : Valery Dachev                                                      #
# E-Mail  : valery@zonebg.com                                                  #
# Homepage: http://valery.zonebg.com                                           #
################################################################################

# The space character (" ") is always treated as a separator; it is hardcoded
# and does not need to be listed here.

@Separators = (
    "\n", "\t",             # line break and tab
    ",", ".", "!", "?",     # punctuation
    "(", ")", "[", "]",     # brackets
);

#### ProcessFile
####
#### Extracts the distinct words from the global $Text and prints them, one per
#### line and in sorted order, to the currently selected output handle.
####
#### Reads : $Text       - the raw text to process (overwritten with the
####                       cleaned-up text)
####         @Separators - strings that are treated like a blank between words
####                       (never modified here)
#### Writes: @Words      - the sorted word list, duplicates included
#### Takes no arguments and returns nothing useful.

sub ProcessFile {

    #### Lowercase all the text (an undefined $Text counts as empty)

    $Text = defined $Text ? lc $Text : '';

    #### Example: Clean up any HTML and PHP3 headers and tags
    #### The text is lowercase by now, so the patterns must be lowercase too.
    #### Non-greedy matches keep each removal limited to a single element.

    $Text =~ s/&#?[a-z0-9]+;/ /g;                 # character entities only
    $Text =~ s{<\?.*?\?>}{ }sg;                   # <? ... ?> blocks
    $Text =~ s{<script\b.*?</script\s*>}{ }sg;    # scripts with their content
    $Text =~ s{<!--.*?-->}{ }sg;                  # comments, even with ">" inside
    $Text =~ s/<[^>]*>/ /g;                       # any remaining tag

    #### Turn every separator into a blank.  \Q...\E quotes the separator for
    #### the match only; escaping the elements of @Separators in place would
    #### escape them again on every later call and break the patterns.

    foreach my $separator (@Separators) {
        next unless defined $separator && length $separator;
        $Text =~ s/\Q$separator\E/ /g;
    }

    #### split ' ' breaks on any run of whitespace and drops empty fields

    @Words = sort split(' ', $Text);

    #### The list is sorted, so duplicates are adjacent: print a word only
    #### when it differs from the one printed before it.

    my $previous;
    foreach my $word (@Words) {
        next if defined $previous && $word eq $previous;
        print "$word\n";
        $previous = $word;
    }
}

#### Show the hint when no arguments are given or when help is requested.
#### Perl's @ARGV does not contain the program name, so the first argument
#### is $ARGV[0].

if (@ARGV == 0 || $ARGV[0] eq "--help" || $ARGV[0] eq "-h") {
    print "I can\'t do anything if you don\'t tell me what to ...\n";
    exit 0;
}

#### Process every file named on the command line, one after another.
#### ProcessFile() takes its input from the global $Text.

foreach my $file (@ARGV) {

    # Three-argument open: the file name is never parsed for a mode.
    # A file that cannot be read is reported and skipped.
    my $handle;
    if (!open($handle, '<', $file)) {
        warn "Cannot open '$file': $!\n";
        next;
    }

    my @lines = <$handle>;
    close($handle);

    $Text = join(' ', @lines);
    ProcessFile();
}

