/*******************************************************************************
 * Gisgraphy Project
 *
 *    This library is free software; you can redistribute it and/or
 *    modify it under the terms of the GNU Lesser General Public
 *    License as published by the Free Software Foundation; either
 *    version 2.1 of the License, or (at your option) any later version.
 *
 *    This library is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *    Lesser General Public License for more details.
 *
 *    You should have received a copy of the GNU Lesser General Public
 *    License along with this library; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
 *
 *   Copyright 2008  Gisgraphy project
 *
 *   David Masclet <davidmasclet@gisgraphy.com>
 ******************************************************************************/
22  package com.gisgraphy.geoloc;
23  
24  import java.util.regex.Matcher;
25  import java.util.regex.Pattern;
26  
27  import org.slf4j.Logger;
28  import org.slf4j.LoggerFactory;
29  
30  public class ZipcodeNormalizer {
31      private static Logger logger = LoggerFactory.getLogger(ZipcodeNormalizer.class);
32  
33      private final static int REGEXP_CASEINSENSITIVE_FLAG = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
34      private final static String CA_PATTERN_EXPRESSION = "(?<=[a-z]\\d[a-z])[\\s–\\-]?\\d[a-z]\\d";
35      private final static Pattern CA_PATTERN = Pattern.compile(CA_PATTERN_EXPRESSION, REGEXP_CASEINSENSITIVE_FLAG);
36      
37      private final static String GB_PATTERN_EXPRESSION = "(?<=[A-Z]{2}\\d[A-Z])\\s?\\d[A-Z]{2}|(?<=[A-Z]{2}\\d{2})\\s?\\d[A-Z]{2}|(?<=[A-Z]\\d)\\s?\\d[A-Z]{2}|(?<=[A-Z]{2}\\d)\\s?\\d[A-Z]{2}|(?<=[A-Z]\\d[A-Z])\\s?\\d[A-Z]{2}|(?<=[A-Z]\\d{2})\\s?\\d[A-Z]{2}|(?<=GIR)\\s?0AA|(?<=[A-Z]{4})\\s?1ZZ";
38    //LLNL NLL|LLNN NLL|LN NLL|LLN NLL|LNL NLL|LNN NLL|
39      private final static Pattern GB_PATTERN = Pattern.compile(GB_PATTERN_EXPRESSION, REGEXP_CASEINSENSITIVE_FLAG);
40  
41      public static String normalize_ca(String string) {
42  	return normalize_contry(string, CA_PATTERN);
43      }
44      
45      public static String normalize_gb(String string) {
46  	return normalize_contry(string, GB_PATTERN);
47      }
48      
49      /**
50       * @return a string that prepare zipcode to be search
51       * because for canada we only got first char and so does for GB
52       */
53      public static String normalize(String string,String countryCode){
54  	if (string==null){
55  	    return null;
56  	}
57  	if (countryCode == null || "".equals(countryCode.trim())){
58  	    return  normalize_ca(normalize_gb(string));
59  	} else if("GB".equalsIgnoreCase(countryCode)){
60  	    return normalize_gb(string);
61  	} else if ("CA".equalsIgnoreCase(countryCode)){
62  	    return normalize_ca(string); 
63  	} else {
64  	    return string;
65  	}
66  	
67      }
68  
69      private static String normalize_contry(String string, Pattern pattern) {
70  	if (string==null){
71  	    return null;
72  	}
73  	Matcher matcher = pattern.matcher(string);
74  
75  	if (logger.isInfoEnabled()) {
76  	    if (matcher.find()) {
77  		logger.info("found one or more zipcode to normalize");
78  		String[] splitedString = new String[matcher.groupCount()];
79  		for (int j = 1; j <= matcher.groupCount(); j++) {
80  		    String group = matcher.group(j);
81  		    if (group != null) {
82  			group = group.trim();
83  		    }
84  		    splitedString[j - 1] = group;
85  		     if (logger.isInfoEnabled()) {
86  		     logger.info("[" + (j - 1) + "]=" + group);
87  		     }
88  		}
89  	    }
90  	}
91  	return pattern.matcher(string).replaceAll("").trim();
92      }
93  
94     
95  }