UrlConverter.pl 4.85 KB
#!/usr/bin/perl
while(<STDIN>){
print url($_);
}

sub url {
   my $entree = shift;
   my @sortie;

   my @tab_lignes = split(/\n/,$entree);
   foreach my $text (@tab_lignes) {
       my @nv_ligne = ();
       # trime les blancs
       $text =~ s/\s+/ /g;
       if (($text =~ /http/) || ($text =~ /www/) || ($text =~ /ftp:/)  || ($text =~ /\.html/) || ($text =~ /\.htm/) || ($text =~ /\@/)|| ($text =~ /\.org/) || ($text =~ /\.net/) || ($text =~ /\.com/) || ($text =~ /\.fr/) || ($text =~ /\.uk/) || ($text =~ /\.be/)) {
           $text =~ s/http\s*:\s*\/\/\s*/ http:\/\//g;
           $text =~ s/http\s*:\/\/\s*/ http:\/\//g;
           $text =~ s/htt\s*:\s*\/\/\s*/ http:\/\//g;
           $text =~ s/http:\/\s+/http:\/\//g;
           $text =~ s/\.www\./\. www\./g;
           $text =~ s/www\.\s+/www\./g;
           $text =~ s/www\s+\./www\./g;
           $text =~ s/ftp\.\s+/ftp\./g;
           $text =~ s/WWW\.\s+/www\./g;
           $text =~ s/\s+\//\//g;
           $text =~ s/\s*~\s*/~/g;
           $text =~ s/\/\s+/\//g;
           $text =~ s/\.\s+com/\.com/g;
           $text =~ s/\.\s+fr/\.fr/g;
           $text =~ s/\.\s+org/\.org/g;
           $text =~ s/\.\s+doc/\.doc/g;
           $text =~ s/\.\s+ppt/\.ppt/g;
           $text =~ s/\.\s+jpg/\.jpg/g;
           $text =~ s/\.\s+pdf/\.pdf/g;
           $text =~ s/\s+\.org/\.org/g;
           $text =~ s/\.\s+gov/\.gov/g;
           $text =~ s/\.\s+gouv\.\s+fr/\.gouv\.fr/g;
           $text =~ s/\s+\.gouv\.\s+fr/\.gouv\.fr/g;
           $text =~ s/\.\s+co\./\.co \./g;
           $text =~ s/\.\s+uk/\.uk/g;
           $text =~ s/\.\s+asso\.\s+fr/\.asso\.fr/g;
           $text =~ s/\.\s+qc/\.qc/g;
           $text =~ s/\.\s+de/\.de/g;
           $text =~ s/\.\s+uk/\.uk/g;
           $text =~ s/\.\s+ca/\.ca/g;
           $text =~ s/\.\s+ch/\.ch/g;
           $text =~ s/\.\s+net/\.net/g;
           $text =~ s/\.\s+th/\.th/g;
           $text =~ s/\.\s+nasa/\.nasa/g;
           $text =~ s/\.\s+ibm\.com/\.ibm\.com/g;
           $text =~ s/\.\s+club\-internet/\.club\-internet/g;
           $text =~ s/\.\s+yahoo/\.yahoo/g;
           $text =~ s/\.\s+oleane/\.oleane/g;
           $text =~ s/\.\s+html/\.html/g;
           $text =~ s/\.\s+asp/\.asp/g;
           $text =~ s/\.\s+php/\.php/g;
           $text =~ s/\.\s+htm/\.htm /g;
           $text =~ s/\s+\.html/\.html/g;
           $text =~ s/\s+\.htm/\.htm /g;
           $text =~ s/\.\s+HTM/\.htm /g;
           $text =~ s/\s+\[at\]\s+/\@/g;
           $text =~ s/http:\/\/([0-9]*[0-9]) ([0-9]*[0-9]) ([0-9]*[0-9]) ([0-9]*[0-9])/http:\/\/\1\.\2\.\3\.\4/g;
           $text =~ s/www\.([a-zA-Z]+)\- ([a-zA-Z]+)/www\.\1\-\2/g;

           $text =~ s/\.\s+com\./\.com \./g;
           $text =~ s/\.\s+fr\./\.fr \./g;
           $text =~ s/\.\s+org\./\.org \./g;
           $text =~ s/\.\s+pdf\./\.pdf \./g;
           $text =~ s/\.\s+de/\.de/g;
           $text =~ s/\s+\.org\./\.org \./g;
           $text =~ s/\.\s+gov\./\.gov \./g;
           $text =~ s/\.\s+uk\./\.uk \./g;
           $text =~ s/\.\s+qc\./\.qc \./g;
           $text =~ s/\.\s+ca\./\.ca \./g;
           $text =~ s/\.\s+ch\./\.ch \./g;
           $text =~ s/\.\s+net\./\.net \./g;
           $text =~ s/\.\s+th\./\.th \./g;
           $text =~ s/\.\s+asp\./\.asp \./g;
           $text =~ s/\.\s+php\./\.php \./g;
           $text =~ s/\.\s+htm\./\.htm \./g;
           $text =~ s/\s+\.html\./\.html \./g;
           $text =~ s/\s+\.htm\./\.htm \./g;
           $text =~ s/\.\s+HTM\./\.htm \./g;
           $text =~ s/\s+/ /g;
           my @line = split(/\s+/, $text);
           for(my $i = 0; $i < scalar(@line); $i++) {
               if (($line[$i] =~ /http/) || ($line[$i] =~ /www/)|| ($line[$i] =~ /ftp/)|| ($line[$i] =~ /@/) || ($line[$i] =~ /\.org/) || ($line[$i] =~ /\.net/) || ($line[$i] =~ /\.com/) || ($line[$i] =~ /\.fr/) || ($line[$i] =~ /\.uk/) || ($line[$i] =~ /\.gov/) || ($line[$i] =~ /\.pdf/) || ($line[$i] =~ /\.php3/) || ($line[$i] =~ /\.co/)
               || ($line[$i] =~ /\.be/)|| ($line[$i] =~ /\.de/)|| ($line[$i] =~ /\.doc/) || ($line[$i] =~ /\.ppt/)) {
                   $line[$i] =~ s/wwww/ www /g;
                   $line[$i] =~ s/\./ point /g;
                   $line[$i] =~ s/\// slash /g;
                   $line[$i] =~ s/:/ deux points /g;
                   $line[$i] =~ s/-/ tiret /g;
                   $line[$i] =~ s/_/ tiret bas /g;
                   $line[$i] =~ s/~/ tilde /g;
                   $line[$i] =~ s/@/ arobase /g;
		   $line[$i] =~ s/#/ die2se /g;
                   $line[$i] =~ s/\?/ point d' interrogation /g;
		   $line[$i] =~ s/=/ e1gale /g;
                   $line[$i] =~ s/([0-9]+)/ \1 /g;
                   push(@nv_ligne, $line[$i]);
               } else {
                   push(@nv_ligne, $line[$i]);
               }
           }
           push(@sortie, join(" ", @nv_ligne));
       }
       else {
           push(@sortie, "$text");
       }
   }
   return (join("\n", @sortie))."\n";
}