diff --git a/org/geekhub/ConnectionUtils.java b/org/geekhub/ConnectionUtils.java deleted file mode 100644 index f67536f..0000000 --- a/org/geekhub/ConnectionUtils.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.geekhub; - -import java.io.IOException; -import java.net.URL; - -/** - * Utils class that contains useful method to interact with URLConnection - */ -public class ConnectionUtils { - - /** - * Downloads content for specified URL and returns it as a byte array. - * Should be used for small files only. Don't use it to download big files it's dangerous. - * @param url - * @return - * @throws IOException - */ - public static byte[] getData(URL url) throws IOException { - //implement me - return null; - } -} diff --git a/src/org/geekhub/ConnectionUtils.java b/src/org/geekhub/ConnectionUtils.java new file mode 100644 index 0000000..65b647a --- /dev/null +++ b/src/org/geekhub/ConnectionUtils.java @@ -0,0 +1,38 @@ +package org.geekhub; + +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URL; + +/** + * Utils class that contains useful method to interact with URLConnection + */ +public class ConnectionUtils { + + public static final int BUFFER_SIZE = 8 * 1024; + public static final int MAX_CONTENT_SIZE = 1024 * 1024; + + /** + * Downloads content for specified URL and returns it as a byte array. + * Should be used for small files only. Don't use it to download big files it's dangerous. + * @param url to download content from + * @return a byte array + * @throws IOException + */ + public static byte[] getData(URL url) throws IOException { + byte[] data; + try (BufferedInputStream inputStream = new BufferedInputStream(url.openStream(), BUFFER_SIZE); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(BUFFER_SIZE)) { + byte[] buffer = new byte[BUFFER_SIZE]; + int num; + int size = 0; + while ((num = inputStream.read(buffer)) != -1 && size < MAX_CONTENT_SIZE) { + outputStream.write(buffer, 0, num); + size += num; + } + data = outputStream.toByteArray(); + } + return data; + } +} diff --git a/org/geekhub/ImageCrawler.java b/src/org/geekhub/ImageCrawler.java similarity index 52% rename from org/geekhub/ImageCrawler.java rename to src/org/geekhub/ImageCrawler.java index 8cad33b..252216a 100644 --- a/org/geekhub/ImageCrawler.java +++ b/src/org/geekhub/ImageCrawler.java @@ -1,10 +1,14 @@ package org.geekhub; -import java.io.*; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; /** * ImageCrawler downloads all images to specified folder from specified resource. @@ -13,8 +17,10 @@ */ public class ImageCrawler { + private List imageExtensions = Arrays.asList("jpg", "jpeg", "bmp", "gif", "png", "tiff", "tif"); + //number of threads to download images simultaneously - public static final int NUMBER_OF_THREADS = 10; + public static final int NUMBER_OF_THREADS = 15; private ExecutorService executorService = Executors.newFixedThreadPool(NUMBER_OF_THREADS); private String folder; @@ -29,7 +35,11 @@ public ImageCrawler(String folder) throws MalformedURLException { * @throws IOException */ public void downloadImages(String urlToPage) throws IOException { - //implement me + Collection urls = new Page(new URL(urlToPage)).getImageLinks(); + urls + .stream() + .filter(this::isImageURL) + .forEach(link -> executorService.execute(new ImageTask(link, folder))); } /** @@ -39,12 +49,26 @@ public void stop() { executorService.shutdown(); } + /** + * Method waits while all tasks have finished + */ + public void awaitTermination() throws InterruptedException { + executorService.shutdown(); + executorService.awaitTermination(1, TimeUnit.MINUTES); + } + //detects is current url is an image. Checking for popular extensions should be enough private boolean isImageURL(URL url) { - //implement me - return false; + String path = url.getFile(); + int index = path.lastIndexOf("."); + String ext; + if (index > 0) { + ext = path.substring(index + 1); + } else { + return false; + } + return imageExtensions + .stream() + .anyMatch(item -> item.equalsIgnoreCase(ext)); } - - - } diff --git a/org/geekhub/ImageTask.java b/src/org/geekhub/ImageTask.java similarity index 69% rename from org/geekhub/ImageTask.java rename to src/org/geekhub/ImageTask.java index de0a340..efcaf1e 100644 --- a/org/geekhub/ImageTask.java +++ b/src/org/geekhub/ImageTask.java @@ -1,12 +1,17 @@ package org.geekhub; +import java.io.IOException; import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; /** * Represents worker that downloads image from URL to specified folder.
* Name of the image will be constructed based on URL. Names for the same URL will be the same. */ public class ImageTask implements Runnable { + private URL url; private String folder; @@ -20,11 +25,17 @@ public ImageTask(URL url, String folder) { */ @Override public void run() { - //implement me + Path path = Paths.get(folder, buildFileName(url)); + try { + Files.write(path, ConnectionUtils.getData(url)); + } catch (IOException e) { + e.printStackTrace(); + } + } //converts URL to unique file name private String buildFileName(URL url) { return url.toString().replaceAll("[^a-zA-Z0-9-_\\.]", "_"); } -} +} \ No newline at end of file diff --git a/org/geekhub/Main.java b/src/org/geekhub/Main.java similarity index 56% rename from org/geekhub/Main.java rename to src/org/geekhub/Main.java index 7cff652..7ddb9ac 100644 --- a/org/geekhub/Main.java +++ b/src/org/geekhub/Main.java @@ -1,11 +1,12 @@ package org.geekhub; import java.io.IOException; +import java.net.URL; import java.util.Scanner; public class Main { - public static final String FOLDER_TO_DOWNLOAD = "d:/images/"; + public static final String FOLDER_TO_DOWNLOAD = "d:/temp/images/"; public static void main(String[] args) throws IOException { ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD); @@ -19,6 +20,21 @@ public static void main(String[] args) throws IOException { imageCrawler.downloadImages(command); System.out.println("...and another url:"); } + imageCrawler.stop(); } + + public void test(URL url) throws IOException { + long time = System.currentTimeMillis(); + + ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD); + imageCrawler.downloadImages(url.toString()); + + try { + imageCrawler.awaitTermination(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + System.out.println("Time spent ms " + (System.currentTimeMillis() - time)); + } } diff --git a/org/geekhub/Page.java b/src/org/geekhub/Page.java similarity index 92% rename from org/geekhub/Page.java rename to src/org/geekhub/Page.java index f9915de..7bc31c0 100644 --- a/org/geekhub/Page.java +++ b/src/org/geekhub/Page.java @@ -3,7 +3,9 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -13,14 +15,14 @@ */ public class Page { Pattern linkPattern = Pattern.compile("]*?\\s)*?href=\"(.*?)\".*?>"); - Pattern imageLinkPattern = Pattern.compile("|)"); + Pattern imageLinkPattern = Pattern.compile("|)", Pattern.DOTALL); private String content; private URL url; /** * Be careful, constructor downloads content, it could be slow. - * @param url + * @param url to page with links * @throws IOException */ public Page(URL url) throws IOException {