mail_to_db.pl
上传用户:dzyhzl
上传日期:2019-04-29
资源大小:56270k
文件大小:8k
- #!/usr/bin/perl
- # Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
- # This file is public domain and comes with NO WARRANTY of any kind
- #
- # This program is brought to you by Janne-Petteri Koilo with the
- # administration of Michael Widenius.
- # This program takes your mails and puts them into your database. It ignores
- # messages with the same from, date and message text.
- # You can use mail-files that are compressed or gzipped and ends with
- # -.gz or -.Z.
- use DBI;
- use Getopt::Long;
- $VER = "1.5";
- $opt_db = "test";
- $opt_table = "mails";
- $opt_max_mail_size = 65536;
- $opt_db_engine = "mysql";
- $opt_host = "localhost";
- $opt_user = $opt_password = "";
- $opt_help = $opt_version = 0;
- GetOptions("help","version","user=s","password=s",
- "db_engine=s","db=s","host=s","max_mail_size=s") || usage();
- usage($VER) if ($opt_help || $opt_version || !$ARGV[0]);
- %months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
- 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
- 'Nov' => 11, 'Des' => 12);
- $count_no_from = $count_no_txt = $count_too_big = 0;
- $count_forwarded_msgs = $count_duplicates = $no_subject = 0;
- $inserted_mails = 0;
- $dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user,
- $opt_password,{ PrintError => 0}) || die $DBI::errstr;
- create_table_if_needed($dbh);
- foreach (@ARGV)
- {
- if (/^(.*).(gz|Z)$/) #checks if the file is compressed or gzipped
- {
- open(FILE, "zcat $_ |");
- process_mail_file($dbh,$1);
- }
- else
- {
- open(FILE,$_);
- process_mail_file($dbh,$_);
- }
- }
- $ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject;
- print "Mails inserted:ttt$inserted_mailsn";
- print "Mails ignored:ttt$ignoredn";
- print "Mails without "From:" -field:t$count_no_fromn";
- print "Mails without message:tt$count_no_txtn";
- print "Too big mails (> $opt_max_mail_size):t$count_too_bign";
- print "Duplicate mails:tt$count_duplicatesn";
- print "Forwarded mails:tt$count_forwarded_msgsn";
- print "Mails altogether:tt";
- print $inserted_mails+$ignored;
- print "n";
- exit(0);
- sub usage
- {
- my($VER)=@_;
-
- $0 =~ s/./(.+)/$1/;
- if ($opt_version)
- {
- print "$0 version $VERn";
- }
- else
- {
- print <<EOF;
- $0 version $VER
- Usage: $0 [options] file1 [file2 file3 ...]
- Description: Inserts mails from file(s) into a database
- Options:
- --help show this help and exit
- --version shows the version of the program
- --db_engine=... database server (default: $opt_db_engine)
- --db=... database to be used (default: $opt_db)
- --host=... hostname to be used (default: $opt_host)
- --password=... user password for the db server
- --user=... username for the db server
- --max_mail_size=# max size of a mail to be inserted into the db.
- mail will be ignored if it exceeds this size
- (default $opt_max_mail_size)
- EOF
- }
- exit(0);
- }
- sub create_table_if_needed
- {
- my ($dbh)=@_;
- my ($sth,$create);
-
- $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr;
- if (!$sth->execute)
- {
- $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null ";
- $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) ";
- $create .= "NOT NULL, mail_from char(120) not null, reply char(120), ";
- $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT ";
- $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key ";
- $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))";
- $sth = $dbh->prepare($create) or die $dbh->errstr;
- $sth->execute() or die $dbh->errstr;
- }
- }
- sub process_mail_file
- {
- my ($dbh,$file_name)= @_;
- my (%values,$type,$check);
- %values=(); $type="";
- $check=0;
- while (<FILE>)
- {
- chop;
- if ($type ne "message")
- {
- if (/^Reply-To: (.*)/i) # finding different fields from file
- {
- $type="reply";
- $values{$type}= $1;
- }
- elsif (/^From: (.*)/i)
- {
- $type="from";
- $values{$type}= $1;
- }
- elsif (/^To: (.*)/i)
- {
- $type="to";
- $values{$type}= $1;
- }
- elsif (/^Cc: (.*)/i)
- {
- $type="cc";
- $values{$type}= $1;
- }
- elsif (/^Subject: (.*)/i)
- {
- $type="subject";
- $values{$type}= $1;
- }
- elsif (/^Date: (.*)/i)
- {
- date_parser($1,%values);
- $type="rubbish";
- }
- elsif (/^[wW-]+:s/)
- {
- $type="rubbish";
- }
- elsif ($_ eq "")
- {
- $type="message";
- $values{$type}="";
- }
- else
- {
- s/^s*/ /;
- $values{$type}.= $_;
- }
- }
- elsif ($check!=0 && $_ ne "") # in case of forwarded messages
- {
- $values{$type}.= "n" . $_;
- $check--;
- }
- elsif (/^From .* dd:dd:ddsdddd$/)
- {
- $values{'hash'}= checksum("$values{'message'}");
- update_table($dbh,$file_name,%values);
- %values=(); $type="";
- $check=0;
- }
- elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
- {
- $values{$type}.= "n" . $_;
- $check++;
- $count_forwarded_msgs++;
- }
- else
- {
- $values{$type}.= "n" . $_;
- }
- }
- $values{'hash'}= checksum("$values{'message'}");
- update_table($dbh,$file_name,%values);
- }
- ########
- # converts date to the right form
- sub date_parser
- {
- my ($date_raw,$values)=@_;
- $date_raw =~ /.*?(d{1,2}?) (.+?) (d{2,4}?) (dd:dd:dd?) ([w-+]{3,5})/;
- $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4";
- $values->{'time_zone'}=$5;
- }
- #########
- # this is runned when the whole mail is gathered.
- # this actually puts the mail to the database.
- sub update_table
- {
- my($dbh,$file_name,$values)=@_;
- my($query);
- if (! defined($values->{'subject'}) || !defined($values->{'to'}) || !defined)
- {
- $no_subject++;
- return; # Ignore these
- }
- $values->{'message'} =~ s/^s*//; #removes whitespaces from the beginning
- $values->{'message'} =~ s/s*$//; #removes whitespaces from the end
- $query = "insert into $opt_table values ('" . "NULL','" . $values->{'date'};
- $query .= "','" . $values->{'time_zone'} . "',";
- $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ",";
- $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ",";
- $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ",";
- $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ",";
- $query .= $dbh->quote($values->{'subject'}) . ",";
- $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name);
- $query .= ",'" . $values->{'hash'} . "')";
-
- if (length($values->{'message'}) > $opt_max_mail_size) #disables big message
- {
- $count_too_big++;
- }
- elsif ($values->{'from'} eq "") #disables mails with no from field
- {
- $count_no_from++;
- }
- elsif ($values->{'message'} eq "") #disables mails with no message text
- {
- $count_no_msg_text++;
- }
- elsif ($dbh->do($query))
- {
- $inserted_mails++;
- }
- elsif (!($dbh->errstr =~ /^Duplicate entry /)) #disables duplicates
- {
- print "Aborting: Got error '$dbh->errstr' for query: '$query'n";
- }
- else
- {
- $count_duplicates++;
- }
- $query="";
- }
- ##########
- # In case you have two identical messages we wanted to identify them
- # and remove additionals; We do this by calculating a hash number of the
- # message and ignoring messages with the same from, date and hash.
- # This function calculates a simple 32 bit hash value for the message.
- sub checksum
- {
- my ($txt)= @_;
- my ($crc,$i,$count);
- $count = length($txt);
- for ($crc = $i = 0; $i < $count ; $i++)
- {
- $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
- (($crc & (1 << 30)) ? 1 : 0);
- $crc &= ((1 << 31) -1);
- }
- return $crc;
- }