From 5c348bb2833f066a45b4511758ad21bff0a17543 Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Tue, 19 Jan 2016 23:13:14 +0200 Subject: [PATCH 1/9] Implemented ConnectionUtils --- org/geekhub/ConnectionUtils.java | 22 ----------------- src/org/geekhub/ConnectionUtils.java | 37 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 22 deletions(-) delete mode 100644 org/geekhub/ConnectionUtils.java create mode 100644 src/org/geekhub/ConnectionUtils.java diff --git a/org/geekhub/ConnectionUtils.java b/org/geekhub/ConnectionUtils.java deleted file mode 100644 index f67536f..0000000 --- a/org/geekhub/ConnectionUtils.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.geekhub; - -import java.io.IOException; -import java.net.URL; - -/** - * Utils class that contains useful method to interact with URLConnection - */ -public class ConnectionUtils { - - /** - * Downloads content for specified URL and returns it as a byte array. - * Should be used for small files only. Don't use it to download big files it's dangerous. - * @param url - * @return - * @throws IOException - */ - public static byte[] getData(URL url) throws IOException { - //implement me - return null; - } -} diff --git a/src/org/geekhub/ConnectionUtils.java b/src/org/geekhub/ConnectionUtils.java new file mode 100644 index 0000000..b46cb9d --- /dev/null +++ b/src/org/geekhub/ConnectionUtils.java @@ -0,0 +1,37 @@ +package org.geekhub; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; + +/** + * Utils class that contains useful method to interact with URLConnection + */ +public class ConnectionUtils { + + public static final int BUFFER_SIZE = 8 * 1024; + public static final int MAX_CONTENT_SIZE = 1024 * 1024; + + /** + * Downloads content for specified URL and returns it as a byte array. + * Should be used for small files only. Don't use it to download big files it's dangerous. + * @param url to download content from + * @return a byte array + * @throws IOException + */ + public static byte[] getData(URL url) throws IOException { + byte[] bytes; + StringBuilder content = new StringBuilder(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()), BUFFER_SIZE)){ + String line; + int size = 0; + while ((line = reader.readLine()) != null && size < MAX_CONTENT_SIZE) { + content.append(line); + size += line.length(); + } + bytes = content.toString().getBytes(); + } + return bytes; + } +} From e611bab52d8ea2770f8b214dd8aec02ae76418ec Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 03:44:09 +0200 Subject: [PATCH 2/9] Implemented ImageTask --- {org => src/org}/geekhub/ImageTask.java | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) rename {org => src/org}/geekhub/ImageTask.java (51%) diff --git a/org/geekhub/ImageTask.java b/src/org/geekhub/ImageTask.java similarity index 51% rename from org/geekhub/ImageTask.java rename to src/org/geekhub/ImageTask.java index de0a340..03f0aea 100644 --- a/org/geekhub/ImageTask.java +++ b/src/org/geekhub/ImageTask.java @@ -1,12 +1,18 @@ package org.geekhub; +import java.io.*; import java.net.URL; +import java.nio.file.Path; +import java.nio.file.Paths; /** * Represents worker that downloads image from URL to specified folder.
* Name of the image will be constructed based on URL. Names for the same URL will be the same. */ public class ImageTask implements Runnable { + + public static final int BUFFER_SIZE = 8 * 1024; + private URL url; private String folder; @@ -20,7 +26,19 @@ public ImageTask(URL url, String folder) { */ @Override public void run() { - //implement me + Path path = Paths.get(folder, buildFileName(url)); + try (BufferedInputStream inputStream = new BufferedInputStream(url.openStream(), BUFFER_SIZE); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + FileOutputStream file = new FileOutputStream(path.toFile())){ + int bytes; + while((bytes = inputStream.read()) >= 0) { + outputStream.write(bytes); + } + outputStream.writeTo(file); + } catch (IOException e) { + e.printStackTrace(); + } + } //converts URL to unique file name From 62c56a7d7fbc596d2ec3e3da7faaa65e7dd14e1d Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 03:45:09 +0200 Subject: [PATCH 3/9] Implemented ImageCrawler --- {org => src/org}/geekhub/ImageCrawler.java | 27 ++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) rename {org => src/org}/geekhub/ImageCrawler.java (64%) diff --git a/org/geekhub/ImageCrawler.java b/src/org/geekhub/ImageCrawler.java similarity index 64% rename from org/geekhub/ImageCrawler.java rename to src/org/geekhub/ImageCrawler.java index 8cad33b..e2e01a2 100644 --- a/org/geekhub/ImageCrawler.java +++ b/src/org/geekhub/ImageCrawler.java @@ -1,8 +1,10 @@ package org.geekhub; -import java.io.*; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.util.Arrays; +import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -13,6 +15,8 @@ */ public class ImageCrawler { + private List imageExtensions = Arrays.asList("jpg", "jpeg", "bmp", "gif", "png", "tiff", "tif"); + //number of threads to download images simultaneously public static final int NUMBER_OF_THREADS = 10; @@ -29,7 +33,10 @@ public ImageCrawler(String folder) throws MalformedURLException { * @throws IOException */ public void downloadImages(String urlToPage) throws IOException { - //implement me + new Page(new URL(urlToPage)).getImageLinks() + .stream() + .filter(this::isImageURL) + .forEach(link -> executorService.execute(new ImageTask(link, folder))); } /** @@ -41,10 +48,16 @@ public void stop() { //detects is current url is an image. Checking for popular extensions should be enough private boolean isImageURL(URL url) { - //implement me - return false; + String path = url.getFile(); + int index = path.lastIndexOf("."); + String ext; + if (index > 0) { + ext = path.substring(index + 1); + } else { + return false; + } + return imageExtensions + .stream() + .anyMatch(item -> item.equals(ext)); } - - - } From 8d0be5ccb2da65ba1a395950e9276a40fc2b9521 Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 03:47:36 +0200 Subject: [PATCH 4/9] Added time test --- {org => src/org}/geekhub/Main.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) rename {org => src/org}/geekhub/Main.java (77%) diff --git a/org/geekhub/Main.java b/src/org/geekhub/Main.java similarity index 77% rename from org/geekhub/Main.java rename to src/org/geekhub/Main.java index 7cff652..8200722 100644 --- a/org/geekhub/Main.java +++ b/src/org/geekhub/Main.java @@ -5,12 +5,17 @@ public class Main { - public static final String FOLDER_TO_DOWNLOAD = "d:/images/"; + public static final String FOLDER_TO_DOWNLOAD = "d:/temp/images/"; public static void main(String[] args) throws IOException { ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD); + + long time = System.currentTimeMillis(); + imageCrawler.downloadImages("http://trinixy.ru/16356-prikolnye_kartinki_ochen_mnogo.html"); + System.out.println("Time spent ms " + (System.currentTimeMillis() - time)); + System.out.println("While it's loading you can enter another url to start download images:"); Scanner scanner = new Scanner(System.in); From 9c732db06dfcfa9e02ffa6b7ee9c2cc736a41b8e Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 03:56:47 +0200 Subject: [PATCH 5/9] change folder --- {org => src/org}/geekhub/Page.java | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {org => src/org}/geekhub/Page.java (100%) diff --git a/org/geekhub/Page.java b/src/org/geekhub/Page.java similarity index 100% rename from org/geekhub/Page.java rename to src/org/geekhub/Page.java From de5a65c95d806b6b9b57f145e3e533aa4e89ec49 Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 04:26:38 +0200 Subject: [PATCH 6/9] Enhanced method run() in ImageTask class --- src/org/geekhub/ImageTask.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/org/geekhub/ImageTask.java b/src/org/geekhub/ImageTask.java index 03f0aea..efcaf1e 100644 --- a/src/org/geekhub/ImageTask.java +++ b/src/org/geekhub/ImageTask.java @@ -1,7 +1,8 @@ package org.geekhub; -import java.io.*; +import java.io.IOException; import java.net.URL; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -11,8 +12,6 @@ */ public class ImageTask implements Runnable { - public static final int BUFFER_SIZE = 8 * 1024; - private URL url; private String folder; @@ -27,14 +26,8 @@ public ImageTask(URL url, String folder) { @Override public void run() { Path path = Paths.get(folder, buildFileName(url)); - try (BufferedInputStream inputStream = new BufferedInputStream(url.openStream(), BUFFER_SIZE); - ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - FileOutputStream file = new FileOutputStream(path.toFile())){ - int bytes; - while((bytes = inputStream.read()) >= 0) { - outputStream.write(bytes); - } - outputStream.writeTo(file); + try { + Files.write(path, ConnectionUtils.getData(url)); } catch (IOException e) { e.printStackTrace(); } @@ -45,4 +38,4 @@ public void run() { private String buildFileName(URL url) { return url.toString().replaceAll("[^a-zA-Z0-9-_\\.]", "_"); } -} +} \ No newline at end of file From 6c645a260c6f95a26b36c7016f092f5b3871a471 Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 15:37:01 +0200 Subject: [PATCH 7/9] Updated method getData in ConnectionUtils class --- src/org/geekhub/ConnectionUtils.java | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/org/geekhub/ConnectionUtils.java b/src/org/geekhub/ConnectionUtils.java index b46cb9d..65b647a 100644 --- a/src/org/geekhub/ConnectionUtils.java +++ b/src/org/geekhub/ConnectionUtils.java @@ -1,8 +1,8 @@ package org.geekhub; -import java.io.BufferedReader; +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStreamReader; import java.net.URL; /** @@ -21,17 +21,18 @@ public class ConnectionUtils { * @throws IOException */ public static byte[] getData(URL url) throws IOException { - byte[] bytes; - StringBuilder content = new StringBuilder(); - try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()), BUFFER_SIZE)){ - String line; + byte[] data; + try (BufferedInputStream inputStream = new BufferedInputStream(url.openStream(), BUFFER_SIZE); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(BUFFER_SIZE)) { + byte[] buffer = new byte[BUFFER_SIZE]; + int num; int size = 0; - while ((line = reader.readLine()) != null && size < MAX_CONTENT_SIZE) { - content.append(line); - size += line.length(); + while ((num = inputStream.read(buffer)) != -1 && size < MAX_CONTENT_SIZE) { + outputStream.write(buffer, 0, num); + size += num; } - bytes = content.toString().getBytes(); + data = outputStream.toByteArray(); } - return bytes; + return data; } } From ea71a05a51fcdac566511dec4640ad4f119a3660 Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 15:40:09 +0200 Subject: [PATCH 8/9] Updated imageLinkPattern, added DOTALL param --- src/org/geekhub/Page.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/org/geekhub/Page.java b/src/org/geekhub/Page.java index f9915de..7bc31c0 100644 --- a/src/org/geekhub/Page.java +++ b/src/org/geekhub/Page.java @@ -3,7 +3,9 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -13,14 +15,14 @@ */ public class Page { Pattern linkPattern = Pattern.compile("]*?\\s)*?href=\"(.*?)\".*?>"); - Pattern imageLinkPattern = Pattern.compile("|)"); + Pattern imageLinkPattern = Pattern.compile("|)", Pattern.DOTALL); private String content; private URL url; /** * Be careful, constructor downloads content, it could be slow. - * @param url + * @param url to page with links * @throws IOException */ public Page(URL url) throws IOException { From c48bba52a05e3acc5e88c1986604d24a7057cb0f Mon Sep 17 00:00:00 2001 From: Sergiy Govorukhin Date: Wed, 20 Jan 2016 16:33:09 +0200 Subject: [PATCH 9/9] Added awaitTermination method to ImageCrawler and test to main class for measuring time --- src/org/geekhub/ImageCrawler.java | 17 ++++++++++++++--- src/org/geekhub/Main.java | 21 ++++++++++++++++----- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/org/geekhub/ImageCrawler.java b/src/org/geekhub/ImageCrawler.java index e2e01a2..252216a 100644 --- a/src/org/geekhub/ImageCrawler.java +++ b/src/org/geekhub/ImageCrawler.java @@ -4,9 +4,11 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; /** * ImageCrawler downloads all images to specified folder from specified resource. @@ -18,7 +20,7 @@ public class ImageCrawler { private List imageExtensions = Arrays.asList("jpg", "jpeg", "bmp", "gif", "png", "tiff", "tif"); //number of threads to download images simultaneously - public static final int NUMBER_OF_THREADS = 10; + public static final int NUMBER_OF_THREADS = 15; private ExecutorService executorService = Executors.newFixedThreadPool(NUMBER_OF_THREADS); private String folder; @@ -33,7 +35,8 @@ public ImageCrawler(String folder) throws MalformedURLException { * @throws IOException */ public void downloadImages(String urlToPage) throws IOException { - new Page(new URL(urlToPage)).getImageLinks() + Collection urls = new Page(new URL(urlToPage)).getImageLinks(); + urls .stream() .filter(this::isImageURL) .forEach(link -> executorService.execute(new ImageTask(link, folder))); @@ -46,6 +49,14 @@ public void stop() { executorService.shutdown(); } + /** + * Method waits while all tasks have finished + */ + public void awaitTermination() throws InterruptedException { + executorService.shutdown(); + executorService.awaitTermination(1, TimeUnit.MINUTES); + } + //detects is current url is an image. Checking for popular extensions should be enough private boolean isImageURL(URL url) { String path = url.getFile(); @@ -58,6 +69,6 @@ private boolean isImageURL(URL url) { } return imageExtensions .stream() - .anyMatch(item -> item.equals(ext)); + .anyMatch(item -> item.equalsIgnoreCase(ext)); } } diff --git a/src/org/geekhub/Main.java b/src/org/geekhub/Main.java index 8200722..7ddb9ac 100644 --- a/src/org/geekhub/Main.java +++ b/src/org/geekhub/Main.java @@ -1,6 +1,7 @@ package org.geekhub; import java.io.IOException; +import java.net.URL; import java.util.Scanner; public class Main { @@ -9,13 +10,8 @@ public class Main { public static void main(String[] args) throws IOException { ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD); - - long time = System.currentTimeMillis(); - imageCrawler.downloadImages("http://trinixy.ru/16356-prikolnye_kartinki_ochen_mnogo.html"); - System.out.println("Time spent ms " + (System.currentTimeMillis() - time)); - System.out.println("While it's loading you can enter another url to start download images:"); Scanner scanner = new Scanner(System.in); @@ -24,6 +20,21 @@ public static void main(String[] args) throws IOException { imageCrawler.downloadImages(command); System.out.println("...and another url:"); } + imageCrawler.stop(); } + + public void test(URL url) throws IOException { + long time = System.currentTimeMillis(); + + ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD); + imageCrawler.downloadImages(url.toString()); + + try { + imageCrawler.awaitTermination(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("Time spent ms " + (System.currentTimeMillis() - time)); + } }