001/* Matcher.java -- Instance of a regular expression applied to a char sequence.
002   Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
003
004This file is part of GNU Classpath.
005
006GNU Classpath is free software; you can redistribute it and/or modify
007it under the terms of the GNU General Public License as published by
008the Free Software Foundation; either version 2, or (at your option)
009any later version.
010
011GNU Classpath is distributed in the hope that it will be useful, but
012WITHOUT ANY WARRANTY; without even the implied warranty of
013MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014General Public License for more details.
015
016You should have received a copy of the GNU General Public License
017along with GNU Classpath; see the file COPYING.  If not, write to the
018Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
01902110-1301 USA.
020
021Linking this library statically or dynamically with other modules is
022making a combined work based on this library.  Thus, the terms and
023conditions of the GNU General Public License cover the whole
024combination.
025
026As a special exception, the copyright holders of this library give you
027permission to link this library with independent modules to produce an
028executable, regardless of the license terms of these independent
029modules, and to copy and distribute the resulting executable under
030terms of your choice, provided that you also meet, for each linked
031independent module, the terms and conditions of the license of that
032module.  An independent module is a module which is not derived from
033or based on this library.  If you modify this library, you may extend
034this exception to your version of the library, but you are not
035obligated to do so.  If you do not wish to do so, delete this
036exception statement from your version. */
037
038
039package java.util.regex;
040
041import gnu.java.lang.CPStringBuilder;
042
043import gnu.java.util.regex.CharIndexed;
044import gnu.java.util.regex.RE;
045import gnu.java.util.regex.REMatch;
046
047/**
048 * Instance of a regular expression applied to a char sequence.
049 *
050 * @since 1.4
051 */
052public final class Matcher implements MatchResult
053{
054  private Pattern pattern;
055  private CharSequence input;
056  // We use CharIndexed as an input object to the getMatch method in order
057  // that /\G/ (the end of the previous match) may work.  The information
058  // of the previous match is stored in the CharIndexed object.
059  private CharIndexed inputCharIndexed;
060  private int position;
061  private int appendPosition;
062  private REMatch match;
063
064  /**
065   * The start of the region of the input on which to match.
066   */
067  private int regionStart;
068
069  /**
070   * The end of the region of the input on which to match.
071   */
072  private int regionEnd;
073
074  /**
075   * True if the match process should look beyond the
076   * region marked by regionStart to regionEnd when
077   * performing lookAhead, lookBehind and boundary
078   * matching.
079   */
080  private boolean transparentBounds;
081
082  /**
083   * The flags that affect the anchoring bounds.
084   * If {@link #hasAnchoringBounds()} is {@code true},
085   * the match process will honour the
086   * anchoring bounds: ^, \A, \Z, \z and $.  If
087   * {@link #hasAnchoringBounds()} is {@code false},
088   * the anchors are ignored and appropriate flags,
089   * stored in this variable, are used to provide this
090   * behaviour.
091   */
092  private int anchoringBounds;
093
094  Matcher(Pattern pattern, CharSequence input)
095  {
096    this.pattern = pattern;
097    this.input = input;
098    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
099    regionStart = 0;
100    regionEnd = input.length();
101    transparentBounds = false;
102    anchoringBounds = 0;
103  }
104
105  /**
106   * Changes the pattern used by the {@link Matcher} to
107   * the one specified.  Existing match information is lost,
108   * but the input and the matcher's position within it is
109   * retained.
110   *
111   * @param newPattern the new pattern to use.
112   * @return this matcher.
113   * @throws IllegalArgumentException if {@code newPattern} is
114   *                                  {@code null}.
115   * @since 1.5
116   */
117  public Matcher usePattern(Pattern newPattern)
118  {
119    if (newPattern == null)
120      throw new IllegalArgumentException("The new pattern was null.");
121    pattern = newPattern;
122    match = null;
123
124    return this;
125  }
126
127  /**
128   * @param sb The target string buffer
129   * @param replacement The replacement string
130   *
131   * @exception IllegalStateException If no match has yet been attempted,
132   * or if the previous match operation failed
133   * @exception IndexOutOfBoundsException If the replacement string refers
134   * to a capturing group that does not exist in the pattern
135   */
136  public Matcher appendReplacement (StringBuffer sb, String replacement)
137    throws IllegalStateException
138  {
139    assertMatchOp();
140    sb.append(input.subSequence(appendPosition,
141                                match.getStartIndex()).toString());
142    sb.append(RE.getReplacement(replacement, match,
143        RE.REG_REPLACE_USE_BACKSLASHESCAPE));
144    appendPosition = match.getEndIndex();
145    return this;
146  }
147
148  /**
149   * @param sb The target string buffer
150   */
151  public StringBuffer appendTail (StringBuffer sb)
152  {
153    sb.append(input.subSequence(appendPosition, input.length()).toString());
154    return sb;
155  }
156
157  /**
158   * @exception IllegalStateException If no match has yet been attempted,
159   * or if the previous match operation failed
160   */
161  public int end ()
162    throws IllegalStateException
163  {
164    assertMatchOp();
165    return match.getEndIndex();
166  }
167
168  /**
169   * @param group The index of a capturing group in this matcher's pattern
170   *
171   * @exception IllegalStateException If no match has yet been attempted,
172   * or if the previous match operation failed
173   * @exception IndexOutOfBoundsException If the replacement string refers
174   * to a capturing group that does not exist in the pattern
175   */
176  public int end (int group)
177    throws IllegalStateException
178  {
179    assertMatchOp();
180    return match.getEndIndex(group);
181  }
182
183  public boolean find ()
184  {
185    boolean first = (match == null);
186    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
187      match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
188    else
189      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
190                                       position, anchoringBounds);
191    if (match != null)
192      {
193        int endIndex = match.getEndIndex();
194        // Is the match within input limits?
195        if (endIndex > input.length())
196          {
197            match = null;
198            return false;
199          }
200        // Are we stuck at the same position?
201        if (!first && endIndex == position)
202          {
203            match = null;
204            // Not at the end of the input yet?
205            if (position < input.length() - 1)
206              {
207                position++;
208                return find(position);
209              }
210            else
211              return false;
212          }
213        position = endIndex;
214        return true;
215      }
216    return false;
217  }
218
219  /**
220   * @param start The index to start the new pattern matching
221   *
222   * @exception IndexOutOfBoundsException If the replacement string refers
223   * to a capturing group that does not exist in the pattern
224   */
225  public boolean find (int start)
226  {
227    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
228      match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
229    else
230      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
231                                       start, anchoringBounds);
232    if (match != null)
233      {
234        position = match.getEndIndex();
235        return true;
236      }
237    return false;
238  }
239
240  /**
241   * @exception IllegalStateException If no match has yet been attempted,
242   * or if the previous match operation failed
243   */
244  public String group ()
245  {
246    assertMatchOp();
247    return match.toString();
248  }
249
250  /**
251   * @param group The index of a capturing group in this matcher's pattern
252   *
253   * @exception IllegalStateException If no match has yet been attempted,
254   * or if the previous match operation failed
255   * @exception IndexOutOfBoundsException If the replacement string refers
256   * to a capturing group that does not exist in the pattern
257   */
258  public String group (int group)
259    throws IllegalStateException
260  {
261    assertMatchOp();
262    return match.toString(group);
263  }
264
265  /**
266   * @param replacement The replacement string
267   */
268  public String replaceFirst (String replacement)
269  {
270    reset();
271    // Semantics might not quite match
272    return pattern.getRE().substitute(input, replacement, position,
273        RE.REG_REPLACE_USE_BACKSLASHESCAPE);
274  }
275
276  /**
277   * @param replacement The replacement string
278   */
279  public String replaceAll (String replacement)
280  {
281    reset();
282    return pattern.getRE().substituteAll(input, replacement, position,
283        RE.REG_REPLACE_USE_BACKSLASHESCAPE);
284  }
285
286  public int groupCount ()
287  {
288    return pattern.getRE().getNumSubs();
289  }
290
291  public boolean lookingAt ()
292  {
293    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
294      match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
295                                       anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
296    else
297      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
298                                       anchoringBounds|RE.REG_FIX_STARTING_POSITION);
299    if (match != null)
300      {
301        if (match.getStartIndex() == 0)
302          {
303            position = match.getEndIndex();
304            return true;
305          }
306        match = null;
307      }
308    return false;
309  }
310
311  /**
312   * Attempts to match the entire input sequence against the pattern.
313   *
314   * If the match succeeds then more information can be obtained via the
315   * start, end, and group methods.
316   *
317   * @see #start()
318   * @see #end()
319   * @see #group()
320   */
321  public boolean matches ()
322  {
323    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
324      match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
325                                       anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
326    else
327      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
328                                       anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
329    if (match != null)
330      {
331        if (match.getStartIndex() == 0)
332          {
333            position = match.getEndIndex();
334            if (position == input.length())
335                return true;
336          }
337        match = null;
338      }
339    return false;
340  }
341
342  /**
343   * Returns the Pattern that is interpreted by this Matcher
344   */
345  public Pattern pattern ()
346  {
347    return pattern;
348  }
349
350  /**
351   * Resets the internal state of the matcher, including
352   * resetting the region to its default state of encompassing
353   * the whole input.  The state of {@link #hasTransparentBounds()}
354   * and {@link #hasAnchoringBounds()} are unaffected.
355   *
356   * @return a reference to this matcher.
357   * @see #regionStart()
358   * @see #regionEnd()
359   * @see #hasTransparentBounds()
360   * @see #hasAnchoringBounds()
361   */
362  public Matcher reset ()
363  {
364    position = 0;
365    match = null;
366    regionStart = 0;
367    regionEnd = input.length();
368    appendPosition = 0;
369    return this;
370  }
371
372  /**
373   * Resets the internal state of the matcher, including
374   * resetting the region to its default state of encompassing
375   * the whole input.  The state of {@link #hasTransparentBounds()}
376   * and {@link #hasAnchoringBounds()} are unaffected.
377   *
378   * @param input The new input character sequence.
379   * @return a reference to this matcher.
380   * @see #regionStart()
381   * @see #regionEnd()
382   * @see #hasTransparentBounds()
383   * @see #hasAnchoringBounds()
384   */
385  public Matcher reset (CharSequence input)
386  {
387    this.input = input;
388    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
389    return reset();
390  }
391
392  /**
393   * @return the index of a capturing group in this matcher's pattern
394   *
395   * @exception IllegalStateException If no match has yet been attempted,
396   * or if the previous match operation failed
397   */
398  public int start ()
399    throws IllegalStateException
400  {
401    assertMatchOp();
402    return match.getStartIndex();
403  }
404
405  /**
406   * @param group The index of a capturing group in this matcher's pattern
407   *
408   * @exception IllegalStateException If no match has yet been attempted,
409   * or if the previous match operation failed
410   * @exception IndexOutOfBoundsException If the replacement string refers
411   * to a capturing group that does not exist in the pattern
412   */
413  public int start (int group)
414    throws IllegalStateException
415  {
416    assertMatchOp();
417    return match.getStartIndex(group);
418  }
419
420  /**
421   * @return True if and only if the matcher hit the end of input.
422   * @since 1.5
423   */
424  public boolean hitEnd()
425  {
426    return inputCharIndexed.hitEnd();
427  }
428
429  /**
430   * @return A string expression of this matcher.
431   */
432  public String toString()
433  {
434    CPStringBuilder sb = new CPStringBuilder();
435    sb.append(this.getClass().getName())
436      .append("[pattern=").append(pattern.pattern())
437      .append(" region=").append(regionStart).append(",").append(regionEnd)
438      .append(" anchoringBounds=").append(anchoringBounds == 0)
439      .append(" transparentBounds=").append(transparentBounds)
440      .append(" lastmatch=").append(match == null ? "" : match.toString())
441      .append("]");
442    return sb.toString();
443  }
444
445  private void assertMatchOp()
446  {
447    if (match == null) throw new IllegalStateException();
448  }
449
450  /**
451   * <p>
452   * Defines the region of the input on which to match.
453   * By default, the {@link Matcher} attempts to match
454   * the whole string (from 0 to the length of the input),
455   * but a region between {@code start} (inclusive) and
456   * {@code end} (exclusive) on which to match may instead
457   * be defined using this method.
458   * </p>
459   * <p>
460   * The behaviour of region matching is further affected
461   * by the use of transparent or opaque bounds (see
462   * {@link #useTransparentBounds(boolean)}) and whether or not
463   * anchors ({@code ^} and {@code $}) are in use
464   * (see {@link #useAnchoringBounds(boolean)}).  With transparent
465   * bounds, the matcher is aware of input outside the bounds
466   * set by this method, whereas, with opaque bounds (the default)
467   * only the input within the bounds is used.  The use of
468   * anchors are affected by this setting; with transparent
469   * bounds, anchors will match the beginning of the real input,
470   * while with opaque bounds they match the beginning of the
471   * region.  {@link #useAnchoringBounds(boolean)} can be used
472   * to turn on or off the matching of anchors.
473   * </p>
474   *
475   * @param start the start of the region (inclusive).
476   * @param end the end of the region (exclusive).
477   * @return a reference to this matcher.
478   * @throws IndexOutOfBoundsException if either {@code start} or
479   *                                   {@code end} are less than zero,
480   *                                   if either {@code start} or
481   *                                   {@code end} are greater than the
482   *                                   length of the input, or if
483   *                                   {@code start} is greater than
484   *                                   {@code end}.
485   * @see #regionStart()
486   * @see #regionEnd()
487   * @see #hasTransparentBounds()
488   * @see #useTransparentBounds(boolean)
489   * @see #hasAnchoringBounds()
490   * @see #useAnchoringBounds(boolean)
491   * @since 1.5
492   */
493  public Matcher region(int start, int end)
494  {
495    int length = input.length();
496    if (start < 0)
497      throw new IndexOutOfBoundsException("The start position was less than zero.");
498    if (start >= length)
499      throw new IndexOutOfBoundsException("The start position is after the end of the input.");
500    if (end < 0)
501      throw new IndexOutOfBoundsException("The end position was less than zero.");
502    if (end > length)
503      throw new IndexOutOfBoundsException("The end position is after the end of the input.");
504    if (start > end)
505      throw new IndexOutOfBoundsException("The start position is after the end position.");
506    reset();
507    regionStart = start;
508    regionEnd = end;
509    return this;
510  }
511
512  /**
513   * The start of the region on which to perform matches (inclusive).
514   *
515   * @return the start index of the region.
516   * @see #region(int,int)
517   * #see #regionEnd()
518   * @since 1.5
519   */
520  public int regionStart()
521  {
522    return regionStart;
523  }
524
525  /**
526   * The end of the region on which to perform matches (exclusive).
527   *
528   * @return the end index of the region.
529   * @see #region(int,int)
530   * @see #regionStart()
531   * @since 1.5
532   */
533  public int regionEnd()
534  {
535    return regionEnd;
536  }
537
538  /**
539   * Returns true if the bounds of the region marked by
540   * {@link #regionStart()} and {@link #regionEnd()} are
541   * transparent.  When these bounds are transparent, the
542   * matching process can look beyond them to perform
543   * lookahead, lookbehind and boundary matching operations.
544   * By default, the bounds are opaque.
545   *
546   * @return true if the bounds of the matching region are
547   *         transparent.
548   * @see #useTransparentBounds(boolean)
549   * @see #region(int,int)
550   * @see #regionStart()
551   * @see #regionEnd()
552   * @since 1.5
553   */
554  public boolean hasTransparentBounds()
555  {
556    return transparentBounds;
557  }
558
559  /**
560   * Sets the transparency of the bounds of the region
561   * marked by {@link #regionStart()} and {@link #regionEnd()}.
562   * A value of {@code true} makes the bounds transparent,
563   * so the matcher can see beyond them to perform lookahead,
564   * lookbehind and boundary matching operations.  A value
565   * of {@code false} (the default) makes the bounds opaque,
566   * restricting the match to the input region denoted
567   * by {@link #regionStart()} and {@link #regionEnd()}.
568   *
569   * @param transparent true if the bounds should be transparent.
570   * @return a reference to this matcher.
571   * @see #hasTransparentBounds()
572   * @see #region(int,int)
573   * @see #regionStart()
574   * @see #regionEnd()
575   * @since 1.5
576   */
577  public Matcher useTransparentBounds(boolean transparent)
578  {
579    transparentBounds = transparent;
580    return this;
581  }
582
583  /**
584   * Returns true if the matcher will honour the use of
585   * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
586   * {@code \z} and {@code $}.  By default, the anchors
587   * are used.  Note that the effect of the anchors is
588   * also affected by {@link #hasTransparentBounds()}.
589   *
590   * @return true if the matcher will attempt to match
591   *         the anchoring bounds.
592   * @see #useAnchoringBounds(boolean)
593   * @see #hasTransparentBounds()
594   * @since 1.5
595   */
596  public boolean hasAnchoringBounds()
597  {
598    return anchoringBounds == 0;
599  }
600
601  /**
602   * Enables or disables the use of the anchoring bounds:
603   * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
604   * {@code $}. By default, their use is enabled.  When
605   * disabled, the matcher will not attempt to match
606   * the anchors.
607   *
608   * @param useAnchors true if anchoring bounds should be used.
609   * @return a reference to this matcher.
610   * @since 1.5
611   * @see #hasAnchoringBounds()
612   */
613  public Matcher useAnchoringBounds(boolean useAnchors)
614  {
615    if (useAnchors)
616      anchoringBounds = 0;
617    else
618      anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
619    return this;
620  }
621
622  /**
623   * Returns a read-only snapshot of the current state of
624   * the {@link Matcher} as a {@link MatchResult}.  Any
625   * subsequent changes to this instance are not reflected
626   * in the returned {@link MatchResult}.
627   *
628   * @return a {@link MatchResult} instance representing the
629   *         current state of the {@link Matcher}.
630   */
631  public MatchResult toMatchResult()
632  {
633    Matcher snapshot = new Matcher(pattern, input);
634    if (match != null)
635      snapshot.match = (REMatch) match.clone();
636    return snapshot;
637  }
638
639  /**
640   * Returns a literalized string of s where characters {@code $} and {@code
641   * \\} are escaped.
642   *
643   * @param s the string to literalize.
644   * @return the literalized string.
645   * @since 1.5
646   */
647  public static String quoteReplacement(String s)
648  {
649    if (s == null)
650      throw new NullPointerException();
651    CPStringBuilder sb = new CPStringBuilder();
652    for (int i = 0; i < s.length(); i++)
653    {
654      char ch = s.charAt(i);
655      if (ch == '$' || ch == '\\')
656        sb.append('\\');
657      sb.append(ch);
658    }
659    return sb.toString();
660  }
661
662}