Blame view

tools/scripts/UrlConverter.pl 4.85 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
  #!/usr/bin/perl
  # Filter entry point: read standard input one record at a time, pass each
  # record through url() and print the converted text on standard output.
  while (defined(my $ligne = <STDIN>)) {
      print url($ligne);
  }
  
  # url($entree)
  #
  # Rewrites the URLs, e-mail addresses and file names found in a piece of
  # text into space-separated French words ("." becomes " point ", "/"
  # becomes " slash ", "@" becomes " arobase ", ...).
  #
  #   $entree - the text to convert; it may hold several newline-separated
  #             lines, each of which is processed independently.
  #
  # Returns the converted lines joined with "\n" and followed by one final
  # "\n".  Lines that contain no URL-like marker only get their whitespace
  # runs collapsed to a single space.
  sub url {
      my $entree = shift;
      my @sortie;

      # The separator is spelled with the "\n" escape on purpose: a raw line
      # break inside the regex or the string would silently swallow whatever
      # indentation the source file happens to have.
      foreach my $ligne (split(/\n/, $entree)) {
          my $text = $ligne;

          # Collapse every run of whitespace into one space.
          $text =~ s/\s+/ /g;

          # No URL-like marker on this line: keep it as it is.
          if ($text !~ m{http|www|ftp:|\@|\.(?:htm|org|net|com|fr|uk|be)}) {
              push(@sortie, $text);
              next;
          }

          $text = _url_join_fragments($text);

          # split(/\s+/) keeps a leading empty field, so a leading space in
          # $text comes back when the words are joined again.
          my @nv_ligne = map { _url_spell_token($_) } split(/\s+/, $text);
          push(@sortie, join(" ", @nv_ligne));
      }

      return join("\n", @sortie) . "\n";
  }

  # Glues back together the pieces of URLs, host names and file names that
  # were broken up by stray spaces ("www. lia. fr" becomes "www.lia.fr").
  # Takes one whitespace-collapsed line and returns the repaired line.  The
  # rules are applied strictly in the order written below.
  sub _url_join_fragments {
      my $text = shift;

      # Scheme and host prefixes.
      $text =~ s{http\s*:\s*//\s*}{ http://}g;
      $text =~ s{http\s*://\s*}{ http://}g;
      $text =~ s{htt\s*:\s*//\s*}{ http://}g;
      $text =~ s{http:/\s+}{http://}g;
      $text =~ s{\.www\.}{. www.}g;
      $text =~ s{www\.\s+}{www.}g;
      $text =~ s{www\s+\.}{www.}g;
      $text =~ s{ftp\.\s+}{ftp.}g;
      $text =~ s{WWW\.\s+}{www.}g;

      # Spaces around path separators and tildes.
      $text =~ s{\s+/}{/}g;
      $text =~ s{\s*~\s*}{~}g;
      $text =~ s{/\s+}{/}g;

      # Domain suffixes, host names and file extensions detached from the
      # dot that precedes them.
      $text =~ s{\.\s+com}{.com}g;
      $text =~ s{\.\s+fr}{.fr}g;
      $text =~ s{\.\s+org}{.org}g;
      $text =~ s{\.\s+doc}{.doc}g;
      $text =~ s{\.\s+ppt}{.ppt}g;
      $text =~ s{\.\s+jpg}{.jpg}g;
      $text =~ s{\.\s+pdf}{.pdf}g;
      $text =~ s{\s+\.org}{.org}g;
      $text =~ s{\.\s+gov}{.gov}g;
      $text =~ s{\.\s+gouv\.\s+fr}{.gouv.fr}g;
      $text =~ s{\s+\.gouv\.\s+fr}{.gouv.fr}g;
      $text =~ s{\.\s+co\.}{.co .}g;
      $text =~ s{\.\s+uk}{.uk}g;
      $text =~ s{\.\s+asso\.\s+fr}{.asso.fr}g;
      $text =~ s{\.\s+qc}{.qc}g;
      $text =~ s{\.\s+de}{.de}g;
      $text =~ s{\.\s+uk}{.uk}g;
      $text =~ s{\.\s+ca}{.ca}g;
      $text =~ s{\.\s+ch}{.ch}g;
      $text =~ s{\.\s+net}{.net}g;
      $text =~ s{\.\s+th}{.th}g;
      $text =~ s{\.\s+nasa}{.nasa}g;
      $text =~ s{\.\s+ibm\.com}{.ibm.com}g;
      $text =~ s{\.\s+club-internet}{.club-internet}g;
      $text =~ s{\.\s+yahoo}{.yahoo}g;
      $text =~ s{\.\s+oleane}{.oleane}g;
      $text =~ s{\.\s+html}{.html}g;
      $text =~ s{\.\s+asp}{.asp}g;
      $text =~ s{\.\s+php}{.php}g;
      $text =~ s{\.\s+htm}{.htm }g;
      $text =~ s{\s+\.html}{.html}g;
      $text =~ s{\s+\.htm}{.htm }g;
      $text =~ s{\.\s+HTM}{.htm }g;

      # "user [at] host" written out to defeat address harvesting.
      $text =~ s{\s+\[at\]\s+}{\@}g;

      # Numeric host whose four groups are separated by single spaces.
      $text =~ s{http://([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)}{http://$1.$2.$3.$4}g;

      # Host name cut after a hyphen.
      $text =~ s{www\.([a-zA-Z]+)- ([a-zA-Z]+)}{www.$1-$2}g;

      # Same suffixes followed by another dot: the trailing dot is set apart
      # with a space.
      # NOTE(review): apart from the repeated ".de" rule, every pattern here
      # starts with a sequence that an earlier rule has already rewritten,
      # so this group looks unreachable; kept unchanged, to be confirmed.
      $text =~ s{\.\s+com\.}{.com .}g;
      $text =~ s{\.\s+fr\.}{.fr .}g;
      $text =~ s{\.\s+org\.}{.org .}g;
      $text =~ s{\.\s+pdf\.}{.pdf .}g;
      $text =~ s{\.\s+de}{.de}g;
      $text =~ s{\s+\.org\.}{.org .}g;
      $text =~ s{\.\s+gov\.}{.gov .}g;
      $text =~ s{\.\s+uk\.}{.uk .}g;
      $text =~ s{\.\s+qc\.}{.qc .}g;
      $text =~ s{\.\s+ca\.}{.ca .}g;
      $text =~ s{\.\s+ch\.}{.ch .}g;
      $text =~ s{\.\s+net\.}{.net .}g;
      $text =~ s{\.\s+th\.}{.th .}g;
      $text =~ s{\.\s+asp\.}{.asp .}g;
      $text =~ s{\.\s+php\.}{.php .}g;
      $text =~ s{\.\s+htm\.}{.htm .}g;
      $text =~ s{\s+\.html\.}{.html .}g;
      $text =~ s{\s+\.htm\.}{.htm .}g;
      $text =~ s{\.\s+HTM\.}{.htm .}g;

      # The rules above may have inserted extra spaces: collapse them again.
      $text =~ s{\s+}{ }g;

      return $text;
  }

  # Spells out one whitespace-free word.  A word that carries no URL-like
  # marker is returned unchanged; otherwise every punctuation sign is
  # replaced by its French name and every run of digits is set apart with
  # spaces.
  sub _url_spell_token {
      my $mot = shift;

      # ".co" also covers ".com", which is why the latter is not listed.
      return $mot
          unless $mot =~ m{http|www|ftp|\@|\.(?:org|net|co|fr|uk|gov|pdf|php3|be|de|doc|ppt)};

      $mot =~ s/wwww/ www /g;
      $mot =~ s/\./ point /g;
      $mot =~ s{/}{ slash }g;
      $mot =~ s/:/ deux points /g;
      $mot =~ s/-/ tiret /g;
      $mot =~ s/_/ tiret bas /g;
      $mot =~ s/~/ tilde /g;
      $mot =~ s/\@/ arobase /g;

      # NOTE(review): "die2se" and "e1gale" are presumably ASCII spellings
      # of the French words for "hash" and "equals"; they are kept byte for
      # byte.  The digit rule at the end then turns them into "die 2 se"
      # and "e 1 gale" -- confirm that this is intended.
      $mot =~ s/#/ die2se /g;
      $mot =~ s/\?/ point d' interrogation /g;
      $mot =~ s/=/ e1gale /g;

      # Set every run of digits apart from its neighbours.
      $mot =~ s/([0-9]+)/ $1 /g;

      return $mot;
  }