forked from Ensembl/ensembl-webcode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParseMaster.pm
207 lines (183 loc) · 7.16 KB
/
ParseMaster.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#ParseMaster (July 25 2005)
# Based on "ParseMaster.js" by Dean Edwards <http://dean.edwards.name/>
# Ported to Perl by Rob Seiler, ELR Software Pty Ltd <http://www.elr.com.au>
# Copyright 2005. License <http://creativecommons.org/licenses/LGPL/2.1/>
package ParseMaster;
use strict;
use Data::Dumper;
# Package wide variable declarations
use vars qw/$VERSION
@_X_escaped @_X_patterns
/;
$VERSION = '017';
# constants
my $X_EXPRESSION = 0;
my $X_REPLACEMENT = 1;
my $X_LENGTH = 2;
# re's used to determine nesting levels
my $X_GROUPS = qr/\(/o; # NB: Requires g modifier!
my $X_SUB_REPLACE = qr/\$\d/o;
my $X_INDEXED = qr/^\$\d+$/o;
my $XX_ESCAPE = qr/\\./o; # NB: Requires g modifier!
my $XX_DELETED = qr/\001[^\001]*\001/o; # NB: Requires g modifier!
my $DIGIT = qr/[^\D]/o; # Yep - this is a digit - contains no non-digits
# Constructor
sub new {
my $class = shift;
my $self = {};
@_X_escaped = (); # Re-initialize global for each instance
@_X_patterns = (); # Re-initialize global for each instance
# Instance variables - access by similarly named set/get functions
$self->{_ignoreCase_} = 0;
$self->{_escapeChar_} = '';
bless ($self, $class);
return $self;
}
sub ignoreCase {
my ($self, $value) = @_;
if (defined($value)) {
$self->{_ignoreCase_} = $value;
}
return $self->{_ignoreCase_};
}
sub escapeChar{
my ($self, $value) = @_;
if (defined($value)) {
$self->{_escapeChar_} = $value;
}
return $self->{_escapeChar_};
}
#######################
# Public Parsemaster functions
my $X_DELETE = sub(@$) {
my $X_offset = pop;
my @X_match = @_;
return (chr(001) . $X_match[$X_offset] . chr(001));
}; # NB semicolon required for closure!
# create and add a new pattern to the patterns collection
sub add {
my ($self, $expression, $X_replacement) = @_;
if (!$X_replacement) {$X_replacement = $X_DELETE};
# count the number of sub-expressions
my $temp = &_X_internalEscape($expression);
my $length = 1; # Always at least one because each pattern is itself a sub-expression
$length += $temp =~ s/$X_GROUPS//g; # One way to count the left capturing parentheses in the regexp string
# does the pattern deal with sub-expressions?
if ((ref($X_replacement) ne "CODE") && ($X_replacement =~ m/$X_SUB_REPLACE/)) {
if ($X_replacement =~ m/$X_INDEXED/) { # a simple lookup? (eg "$2")
# store the index (used for fast retrieval of matched strings)
$X_replacement = substr($X_replacement,1) - 1;
}
else { # a complicated lookup (eg "Hello $2 $1")
my $i = $length;
while ($i) { # Had difficulty getting Perl to do Dean's splitting and joining of strings containing $'s
my $str = '$a[$o+' . ($i-1) . ']'; # eg $a[$o+1]
$X_replacement =~ s/\$$i/$str/; # eg $2 $3 -> $a[$o+1] $a[$o+2]
$i--;
}
# build a function to do the lookup - returns interpolated string of array lookups
$X_replacement = eval('sub {my $o=pop; my @a=@_; return "' . $X_replacement . '"};');
}
}
else {}
# pass the modified arguments
&_X_add($expression || q/^$/, $X_replacement, $length);
}
# execute the global replacement
sub exec {
#print Dumper(@_X_patterns);
my ($self, $X_string) = @_;
my $escChar = $self->escapeChar();
my $ignoreCase = $self->ignoreCase();
my ($regexp,$captures) = &_getPatterns(); # Concatenated and parenthesized regexp eg '(regex1)|(regex2)|(regex3)' etc
$X_string = &_X_escape($X_string, $escChar);
if ($ignoreCase) {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/gie} # Pass $X_String as a
else {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/ge} # reference for speed
$X_string = &_X_unescape($X_string, $escChar);
$X_string =~ s/$XX_DELETED//g;
return $X_string;
}
sub _X_add {
push (@_X_patterns, [@_]); # Save each argument set as is into an array of arrays
}
# this is the global replace function (it's quite complicated)
sub _X_replacement {
my (@arguments) = @_;
#print Dumper (@arguments);
if ($arguments[0] le '') {return ''}
# Dereference last index (source String) here - faster than in _matchVars (maybe not needed at all?)
$arguments[$#arguments] = ${$arguments[$#arguments]};
my $i = 1;
# loop through the patterns
for (my $j=0; $j<scalar(@_X_patterns); $j++) { # Loop through global all @_X_patterns
my @X_pattern = @{$_X_patterns[$j]};
# do we have a result? NB: "if ($arguments[$i])" as in Dean's Javascript is false for the value 0!!!
if ((defined $arguments[$i]) && ($arguments[$i] gt '')) {
my $X_replacement = $X_pattern[$X_REPLACEMENT];
# switch on type of $replacement
if (ref($X_replacement) eq "CODE") { # function
return &$X_replacement(@arguments,$i);
}
elsif ($X_replacement =~ m/$DIGIT/) { # number (contains no non-digits)
return $arguments[$X_replacement + $i];
}
else { # default
return $X_replacement; # default
}
} # skip over references to sub-expressions
else {$i += $X_pattern[$X_LENGTH]}
}
}
#######################
# Private functions
#######################
# encode escaped characters
sub _X_escape {
my ($X_string, $X_escapeChar) = @_;
if ($X_escapeChar) {
my $re = '\\'.$X_escapeChar.'(.)';
$X_string =~ s/$re/{push(@_X_escaped,$1); $X_escapeChar}/ge;
}
return $X_string;
}
# decode escaped characters
sub _X_unescape {
my ($X_string, $X_escapeChar) = @_;
if ($X_escapeChar) { # We'll only do this if there is an $X_escapeChar!
my $re = '\\'.$X_escapeChar;
$X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped))}/ge; # Don't use Dean Edwards as below 'or' here - because zero will return ''!
# $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped) || '')}/ge;
}
return $X_string;
}
sub _X_internalEscape {
my ($string) = shift;
$string =~ s/$XX_ESCAPE//g;
return $string;
}
# Builds an array of match variables to (approximately) emulate that available in Javascript String.replace()
sub _matchVars {
my ($m,$sref) = @_;
my @args = (1..$m); # establish the number potential memory variables
my @mv = map {eval("\$$_")} @args; # matchvarv[1..m] = the memory variables $1 .. $m
unshift (@mv, $&); # matchvar[0] = the substring that matched
push (@mv, length($`)); # matchvar[m+1] = offset within the source string where the match occurred (= length of prematch string)
push (@mv, $sref); # matchvar[m+2] = reference to full source string (dereference in caller if/when needed)
#print Dumper (@mv);
return @mv;
}
sub _getPatterns {
my @Patterns = ();
my $lcp = 0;
for (my $i=0; $i<scalar(@_X_patterns); $i++) { # Loop through global all @_patterns
push (@Patterns, $_X_patterns[$i][$X_EXPRESSION]); # accumulate the expressions
$lcp += $_X_patterns[$i][$X_LENGTH]; # sum the left capturing parenthesis counts
}
my $str = "(" . join(')|(',@Patterns). ")"; # enclose each pattern in () separated by "|"
return ($str, $lcp);
}
##################
# END #
##################
1; # ParseMaster #
##################