View Javadoc
1   /*******************************************************************************
2    *   Gisgraphy Project 
3    * 
4    *   This library is free software; you can redistribute it and/or
5    *   modify it under the terms of the GNU Lesser General Public
6    *   License as published by the Free Software Foundation; either
7    *   version 2.1 of the License, or (at your option) any later version.
8    * 
9    *   This library is distributed in the hope that it will be useful,
10   *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12   *   Lesser General Public License for more details.
13   * 
14   *   You should have received a copy of the GNU Lesser General Public
15   *   License along with this library; if not, write to the Free Software
16   *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
17   * 
18   *  Copyright 2008  Gisgraphy project 
19   *  David Masclet <davidmasclet@gisgraphy.com>
20   *  
21   *  
22   *******************************************************************************/
23  /**
24   *
25   */
26  package com.gisgraphy.importer;
27  
28  import java.io.BufferedOutputStream;
29  import java.io.File;
30  import java.io.FileFilter;
31  import java.io.FileNotFoundException;
32  import java.io.FileOutputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.io.OutputStream;
36  import java.net.HttpURLConnection;
37  import java.net.MalformedURLException;
38  import java.net.ProtocolException;
39  import java.net.URL;
40  import java.net.UnknownHostException;
41  import java.util.Enumeration;
42  import java.util.List;
43  import java.util.regex.Pattern;
44  import java.util.zip.ZipEntry;
45  import java.util.zip.ZipFile;
46  
47  import org.apache.commons.httpclient.Header;
48  import org.apache.commons.httpclient.HttpClient;
49  import org.apache.commons.httpclient.HttpException;
50  import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
51  import org.apache.commons.httpclient.methods.HeadMethod;
52  import org.apache.commons.httpclient.params.HttpClientParams;
53  import org.slf4j.Logger;
54  import org.slf4j.LoggerFactory;
55  
56  import com.gisgraphy.domain.geoloc.entity.Adm;
57  import com.gisgraphy.helper.FeatureClassCodeHelper;
58  
59  /**
60   * Useful methods for importer
61   * @author <a href="mailto:david.masclet@gisgraphy.com">David Masclet</a> 
62   */
63  public class ImporterHelper {
64  
65      /**
66       * The readme filename (it must not be processed)
67       */
68      public static final String EXCLUDED_README_FILENAME = "readme.txt";
69      /**
70       * the all country dump file name
71       */
72      public static final String ALLCOUTRY_FILENAME = "allCountries.txt";
73      /**
74       * The regexp that every country file dump matches
75       */
76      public static final String GEONAMES_COUNTRY_FILE_ACCEPT_REGEX_STRING = "[A-Z][A-Z](.txt)";
77  
78      public static final String OPENSTREETMAP_US_FILE_ACCEPT_REGEX_STRING = "(US.)[0-9]+(.txt)";
79      
80      public static final String QUATTROSHAPES_FILE_ACCEPT_REGEX_STRING = "(localities.txt)";
81  
82      public static final String SPLITED_FILE_ACCEPT_REGEX_STRING = "[A-Z][A-Z](.)[0-9]+(.txt)";
83      
84      //2 letter but not us, it is managed by SPLITED_OPENSTREETMAP_US_FILE_ACCEPT_REGEX_STRING
85      public static final String SPLITED_OPENSTREETMAP_FILE_ACCEPT_REGEX_STRING = "((?!(?:US))[A-Z][A-Z])(.)[0-9]+(.txt)";
86      
87      
88      public static final String SPLITED_GEONAMES_ALTERNATENAMES_FILE_ACCEPT_REGEX_STRING = "(US.)[0-9]+(.txt)";
89      
90      public static final String SPLITED_ALLCOUNTRIES_FILE_ACCEPT_REGEX_STRING = "(allCountries)(.)[0-9]+(.txt)";
91      
92      /**
93       * The regexp that every zipped country file dump matches
94       */
95      public static final String ZIP_FILE_ACCEPT_REGEX_STRING = ".*(.zip)";
96  
97      public static final String TAR_BZ2_FILE_ACCEPT_REGEX_STRING = ".*(.tar.bz2)";
98  
99      protected static final Logger logger = LoggerFactory.getLogger(ImporterHelper.class);
100     
101     private static HttpClientParams params = new HttpClientParams(){{
102   		setConnectionManagerTimeout(2000);
103   		setSoTimeout(2000);
104   	}
105   	};
106     private static MultiThreadedHttpConnectionManager connectionManager = 	new MultiThreadedHttpConnectionManager();
107   	private static HttpClient client = new HttpClient(connectionManager){{
108   		setParams(params);
109   	}};
110 
111     public static FileFilter countryFileFilter = new FileFilter() {
112 	public boolean accept(File file) {
113 	    Pattern patternGeonames = Pattern.compile(GEONAMES_COUNTRY_FILE_ACCEPT_REGEX_STRING);
114 	    Pattern patternOpenStreetMapUS = Pattern.compile(OPENSTREETMAP_US_FILE_ACCEPT_REGEX_STRING);
115 	    Pattern patternQuattroshapes = Pattern.compile(QUATTROSHAPES_FILE_ACCEPT_REGEX_STRING);
116 
117 	    return (file.isFile() && file.exists()) && !EXCLUDED_README_FILENAME.equals(file.getName())
118 		    && ( patternGeonames.matcher(file.getName()).matches() || ALLCOUTRY_FILENAME.equals(file.getName()) || patternOpenStreetMapUS.matcher(file.getName()).matches() || patternQuattroshapes.matcher(file.getName()).matches());
119 	}
120     };
121     
122 
123 	public static FileFilter splitedFileFilter = new FileFilter() {
124 		public boolean accept(File file) {
125 			Pattern patternSplit = Pattern.compile(SPLITED_FILE_ACCEPT_REGEX_STRING);
126 			Pattern patternAllCountriesSplit = Pattern.compile(SPLITED_ALLCOUNTRIES_FILE_ACCEPT_REGEX_STRING);
127 
128 			return (file.isFile() && file.exists()) && !EXCLUDED_README_FILENAME.equals(file.getName()) && (patternAllCountriesSplit.matcher(file.getName()).matches() || patternSplit.matcher(file.getName()).matches());
129 		}
130 	};
131 	
132 	
133     private static FileFilter ZipFileFilter = new FileFilter() {
134 	public boolean accept(File file) {
135 	    Pattern pattern = Pattern.compile(ZIP_FILE_ACCEPT_REGEX_STRING);
136 
137 	    return (file.isFile() && file.exists()) && pattern.matcher(file.getName()).matches();
138 	}
139     };
140 
141     private static FileFilter tarBZ2FileFilter = new FileFilter() {
142 	public boolean accept(File file) {
143 	    Pattern pattern = Pattern.compile(TAR_BZ2_FILE_ACCEPT_REGEX_STRING);
144 
145 	    return (file.isFile() && file.exists()) && pattern.matcher(file.getName()).matches();
146 	}
147     };
148 
149     /**
150      * @param directoryPath
151      *            The directory where files are
152      * @see #GEONAMES_COUNTRY_FILE_ACCEPT_REGEX_STRING
153      * @return the allcountries.txt (@see {@linkplain #ALLCOUTRY_FILENAME} file
154      *         if present or the list of country file to Import or an empty
155      *         array if there is no file
156      */
157     public static File[] listCountryFilesToImport(String directoryPath) {
158 
159 	File dir = new File(directoryPath);
160 
161 	File[] files = dir.listFiles(countryFileFilter);
162 
163 	if (files == null) {
164 	    return new File[0];
165 	}
166 
167 	for (File file : files) {
168 	    if (ALLCOUTRY_FILENAME.equals(file.getName())) {
169 		files = new File[1];
170 		files[0] = file;
171 		logger.info(ALLCOUTRY_FILENAME + " is present. Only this file will be imported. all other country files will be ignore");
172 		break;
173 	    }
174 	}
175 	
176 	if (files.length==0){
177 	    logger.warn("there is no file to import in "+directoryPath);
178 	}
179 
180 	// for Log purpose
181 	for (int i = 0; i < files.length; i++) {
182 	    logger.info(files[i].getName() + " is an importable File");
183 	}
184 	logger.info(files.length +" files are importable files");
185 
186 	return files;
187     }
188 
189     
190     
191     /**
192      * @param directoryPath
193      *            The directory where splited files are
194      * 
195      */
196     public static File[] listSplitedFilesToImport(String directoryPath) {
197 
198 	File dir = new File(directoryPath);
199 
200 	File[] files = dir.listFiles(splitedFileFilter);
201 
202 	if (files == null) {
203 	    return new File[0];
204 	}
205 
206 		
207 	if (files.length==0){
208 	    logger.warn("there is no file to import in "+directoryPath);
209 	}
210 
211 	// for Log purpose
212 	for (int i = 0; i < files.length; i++) {
213 	    logger.info(files[i].getName() + " is a Geonames splited importable File");
214 	}
215 	logger.info(files.length +" files are Geonames importable files");
216 
217 	return files;
218     }
219     
220    
221 
222     /**
223      * @param directoryPath
224      *            The directory where Geonames files are to be downloaded in
225      *            order to be processed
226      * @see #ZIP_FILE_ACCEPT_REGEX_STRING
227      * @return all the zip files present in the specified directory or an empty
228      *         array if there is no file
229      */
230     public static File[] listZipFiles(String directoryPath) {
231 
232 	File dir = new File(directoryPath);
233 
234 	File[] files = dir.listFiles(ZipFileFilter);
235 	return files == null ? new File[0] : files;
236     }
237 
238     /**
239      * @param directoryPath
240      *            The directory where openstreetmap files are to be downloaded
241      *            in order to be processed
242      * @see #TAR_BZ2_FILE_ACCEPT_REGEX_STRING
243      * @return all the zip files present in the specified directory or an empty
244      *         array if there is no file
245      */
246     public static File[] listTarFiles(String directoryPath) {
247 
248 	File dir = new File(directoryPath);
249 
250 	File[] files = dir.listFiles(tarBZ2FileFilter);
251 	return files == null ? new File[0] : files;
252     }
253 
254     
255     /**
256      * @param URL the HTTP URL
257      * @return The size of the HTTP file using HTTP head method 
258      * or -1 if error or the file doesn't exists
259      */
260     public static long getHttpFileSize(String URL){
261 	HeadMethod headMethod = new HeadMethod(URL);
262 	//we can not follow redirect because Geonames send a 302 found HTTP status code when a file doen't exists
263 	headMethod.setFollowRedirects(false);
264     try {
265     	int code = client.executeMethod(headMethod);
266     	int firstDigitOfCode = code/100;
267     	switch (firstDigitOfCode) {
268 	case 4 :
269 	    logger.error("Can not determine HTTP file size of "+URL+" because it does not exists ("+code+")");
270 	    return -1;
271 	//needed to catch 3XX code because Geonames send a 302 found HTTP status code when a file doen't exists
272 	case 3 :
273 	    logger.error("Can not determine HTTP file size of "+URL+" because it does not exists ("+code+")");
274 	    return -1;
275 	case 5:
276 	    logger.error("Can not determine HTTP file size of "+URL+" because the server send an error "+code);
277 	    return -1;
278 
279 	default:
280 	    break;
281 	}
282 	Header[] contentLengthHeaders = headMethod.getResponseHeaders("Content-Length");
283 	if (contentLengthHeaders.length ==1){
284 	    logger.info("HTTP file size of "+URL+" = "+contentLengthHeaders[0].getValue());
285 	    return new Long(contentLengthHeaders[0].getValue());
286 	} else if (contentLengthHeaders.length <= 0){
287 	    return -1L;
288 	}
289     } catch (HttpException e) {
290 	logger.error("can not execute head method for "+URL+" : "+e.getMessage(),e);
291     } catch (IOException e) {
292     	logger.error("can not execute head method for "+URL+" : "+e.getMessage(),e);
293     } finally {
294         headMethod.releaseConnection();
295     }
296     return -1;
297     }
298     
299     /**
300      * @param urlsAsString
301      * @return true if ALL the url doesn't retrun 200 or 3XX code 
302      * and are valids
303      */
304     public static boolean checkUrls(List<String> urlsAsString){
305     	if (urlsAsString==null){
306     		return false;
307     	}
308     	for (String url:urlsAsString){
309     		if (!checkUrl(url)){
310     			return false;
311     		}
312     	}
313     	return true;
314     }
315     
316     /**
317      * check if an url doesn't return 200 or 3XX code
318      * @param urlAsString the url to check
319      * @return true if the url exists and is valid
320      */
321     public static boolean checkUrl(String urlAsString){
322     	if (urlAsString==null){
323     		logger.error("can not check null URL");
324     		return false;
325     	}
326     	URL url;
327 		try {
328 			url = new URL(urlAsString);
329 		} catch (MalformedURLException e) {
330 			logger.error(urlAsString+" is not a valid url, can not check.");
331 			return false;
332 		}
333     	int responseCode;
334     	String responseMessage = "NO RESPONSE MESSAGE";
335     	Object content = "NO CONTENT";
336     	HttpURLConnection huc;
337 		try {
338 			huc = (HttpURLConnection) url.openConnection();
339 			huc.setRequestMethod("HEAD");
340 			responseCode = huc.getResponseCode();
341 			content = huc.getContent();
342 			responseMessage = huc.getResponseMessage();
343 		} catch (ProtocolException e) {
344 			logger.error("can not check url "+e.getMessage(),e);
345 			return false;
346 		} catch (IOException e) {
347 			logger.error("can not check url "+e.getMessage(),e);
348 			return false;
349 		}
350 
351     	if (responseCode == 200 || (responseCode >300 &&  responseCode < 400)) {
352     		logger.info("URL "+urlAsString+ " exists");
353     		return true;
354     	} else {
355     		logger.error(urlAsString+" return a "+responseCode+" : "+content+"/"+responseMessage);
356     	return false;
357     	}
358     }
359     
360     /**
361      * @param address
362      *            the address of the file to be downloaded
363      * @param localFileName
364      *            the local file name (with absolute path)
365      */
366     public static void download(String address, String localFileName) throws  FileNotFoundException{
367 	logger.info("download file " + address + " to " + localFileName);
368 	OutputStream out = null;
369 	HttpURLConnection conn = null;
370 	InputStream in = null;
371 	try {
372 	    URL url = new URL(address);
373 	    conn = (HttpURLConnection) url.openConnection();
374 	    if (conn instanceof HttpURLConnection) {
375     	((HttpURLConnection) conn).setInstanceFollowRedirects(false);
376 	    	
377 		int responseCode = ((HttpURLConnection) conn).getResponseCode();
378 		//manage most frequent error code and Gisgraphy specific one
379 		switch (responseCode) {
380 		case 509:
381 		    throw new RuntimeException("Sorry, there is too many users connected for "+address+", this site has limmited resources, please try again later");
382 		case 500:
383 		    throw new RuntimeException("Sorry, the server return an 500 status code for "+address+", an internal error has occured");
384 		case 404:
385 		    throw new FileNotFoundException("Sorry, the server return an 404 status code for "+address+", the file probably not exists or the URL is not correct");
386 		case 302:
387 		    throw new FileNotFoundException("Sorry, the server return an 302 status code for "+address+", the file is not at the correct URL");
388 		default:
389 		    break;
390 		}
391 		
392 	    }
393 	    in = conn.getInputStream();
394 	    out = new BufferedOutputStream(new FileOutputStream(localFileName));
395 	    byte[] buffer = new byte[1024];
396 	    int numRead;
397 	    long numWritten = 0;
398 	    while ((numRead = in.read(buffer)) != -1) {
399 		out.write(buffer, 0, numRead);
400 		numWritten += numRead;
401 	    }
402 	    logger.info(localFileName + "\t" + numWritten);
403 	} catch (UnknownHostException e) {
404 	    String errorMessage = "can not download " + address + " to " + localFileName + " : " + e.getMessage() + ". if the host exists and is reachable," + " maybe this links can help : http://www.gisgraphy.com/forum/viewtopic.php?f=3&t=64 ";
405 	    logger.warn(errorMessage);
406 	    throw new ImporterException(errorMessage, e);
407 	} catch (FileNotFoundException e) {
408 	    throw e;
409 	} catch (Exception e) {
410 	    logger.warn("can not download " + address + " to " + localFileName + " : " + e.getMessage());
411 	    throw new ImporterException(e);
412 	} finally {
413 	    try {
414 		if (in != null) {
415 		    in.close();
416 		}
417 		if (out != null) {
418 		    out.flush();
419 		    out.close();
420 		}
421 	    } catch (IOException ioe) {
422 		logger.error("cannot close streams");
423 	    }
424 	}
425     }
426 
427     /**
428      * unzip a file in the same directory as the zipped file
429      * 
430      * @param file
431      *            The file to unzip
432      */
433     public static void unzipFile(File file) {
434 	logger.info("will Extracting file: " + file.getName());
435 	Enumeration<? extends ZipEntry> entries;
436 	ZipFile zipFile;
437 
438 	try {
439 	    zipFile = new ZipFile(file);
440 
441 	    entries = zipFile.entries();
442 
443 	    while (entries.hasMoreElements()) {
444 		ZipEntry entry = (ZipEntry) entries.nextElement();
445 
446 		if (entry.isDirectory()) {
447 		    // Assume directories are stored parents first then
448 		    // children.
449 		    (new File(entry.getName())).mkdir();
450 		    continue;
451 		}
452 
453 		logger.info("Extracting file: " + entry.getName() + " to " + file.getParent() + File.separator + entry.getName());
454 		copyInputStream(zipFile.getInputStream(entry), new BufferedOutputStream(new FileOutputStream(file.getParent() + File.separator + entry.getName())));
455 	    }
456 
457 	    zipFile.close();
458 	} catch (IOException e) {
459 	    logger.error("can not unzip " + file.getName() + " : " + e.getMessage(),e);
460 	    throw new ImporterException(e);
461 	}
462     }
463 
464     private static final void copyInputStream(InputStream in, OutputStream out) throws IOException {
465 	byte[] buffer = new byte[1024];
466 	int len;
467 
468 	while ((len = in.read(buffer)) >= 0) {
469 	    out.write(buffer, 0, len);
470 	}
471 
472 	in.close();
473 	out.close();
474     }
475 
476     /**
477      * @param fields
478      *            the fields corresponding to a split line of the csv geonames file
479      * @return the modified fields whith the feature code change to
480      *         ADM1,ADM2,ADM3,ADM4 according to the ADMcodes. e.g id adm1code
481      *         and Adm2 code are not null : the feature code will be change to
482      *         ADM2.
483      */
484     public static String[] virtualizeADMD(String[] fields) {
485 	if (fields[7] != null && "ADMD".equals(fields[7]) && fields[6] != null && "A".equals(fields[6])) {
486 	    // it is an ADMD, will try to detect level
487 	    int level = Adm.getProcessedLevelFromCodes(fields[10], fields[11], fields[12], fields[13]);
488 	    if (level != 0) {
489 		fields[7] = "ADM" + level;
490 	    }
491 	}
492 	return fields;
493 
494     }
495 
496     public static String[] correctLastAdmCodeIfPossible(String[] fields) {
497 	if (FeatureClassCodeHelper.is_Adm(fields[6], fields[7]) && !AbstractSimpleImporterProcessor.isEmptyField(fields, 0, false)) {
498 	    int level = Adm.getProcessedLevelFromFeatureClassCode(fields[6], fields[7]);
499 	    switch (level) {
500 	    case 0:
501 		return fields;
502 	    case 1:
503 		if (AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false)) {
504 		    fields[10] = fields[0];// asign adm1code with featureid
505 		}
506 		return fields;
507 	    case 2:
508 		if (!AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false) && AbstractSimpleImporterProcessor.isEmptyField(fields, 11, false)) {
509 		    fields[11] = fields[0];// asign adm2code with featureid
510 		}
511 		return fields;
512 	    case 3:
513 		if (!AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false) && !AbstractSimpleImporterProcessor.isEmptyField(fields, 11, false) && AbstractSimpleImporterProcessor.isEmptyField(fields, 12, false)) {
514 		    fields[12] = fields[0];// asign adm3code with featureid
515 		}
516 		return fields;
517 	    case 4:
518 		if (!AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false) && !AbstractSimpleImporterProcessor.isEmptyField(fields, 11, false) && !AbstractSimpleImporterProcessor.isEmptyField(fields, 12, false)
519 			&& AbstractSimpleImporterProcessor.isEmptyField(fields, 13, false)) {
520 		    fields[13] = fields[0];// asign adm4code with featureid
521 		}
522 		return fields;
523 
524 	    default:
525 		return fields;
526 	    }
527 
528 	}
529 	return fields;
530     }
531 
532     /**
533      * @param regexp
534      *            a regexp
535      * @return A {@link Pattern} or null if the regexp are not corrects
536      */
537     public static Pattern compileRegex(String regexp) {
538 	try {
539 	   	if (regexp != null && !regexp.trim().equals("")) {
540 		    return Pattern.compile(regexp,Pattern.CASE_INSENSITIVE);
541 		} else {
542 			return null;
543 		}
544 	} catch (RuntimeException e) {
545 	    return null;
546 	}
547     }
548 
549     /**
550      * @param secsIn
551      *            the number of seconds
552      * @return a human reading strings. example :1 hour 6 minuts 40 seconds.
553      */
554     public static String formatSeconds(long secsIn) {
555 
556 	long hours = secsIn / 3600,
557 
558 	remainder = secsIn % 3600, minutes = remainder / 60, seconds = remainder % 60;
559 	String displayhours = hours == 0 ? "" : hours + " hour" + getPlural(hours);
560 	String displayMin = minutes == 0 ? "" : minutes + " minut" + getPlural(minutes);
561 	String displaySec = seconds == 0 ? "" : seconds + " second" + getPlural(seconds);
562 	return displayhours + displayMin + displaySec;
563     }
564 
565     private static String getPlural(long count) {
566 	return count > 1 ? "s " : " ";
567     }
568 
569 }