Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions org/geekhub/ConnectionUtils.java

This file was deleted.

38 changes: 38 additions & 0 deletions src/org/geekhub/ConnectionUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.geekhub;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URL;

/**
 * Utility class with helper methods for downloading content through a {@link URL}.
 */
public class ConnectionUtils {

    /** Size in bytes of the read buffer and of the initial output buffer. */
    public static final int BUFFER_SIZE = 8 * 1024;

    /** Upper bound in bytes on the amount of content returned by {@link #getData(URL)}. */
    public static final int MAX_CONTENT_SIZE = 1024 * 1024;

    /**
     * Downloads the content of the specified URL and returns it as a byte array.
     * <p>
     * Should be used for small files only: the whole content is kept in memory and at most
     * {@link #MAX_CONTENT_SIZE} bytes are returned, anything beyond that limit is dropped.
     *
     * @param url to download content from
     * @return the downloaded bytes, never more than {@link #MAX_CONTENT_SIZE}
     * @throws IOException if the stream cannot be opened or read
     */
    public static byte[] getData(URL url) throws IOException {
        try (BufferedInputStream inputStream = new BufferedInputStream(url.openStream(), BUFFER_SIZE);
             ByteArrayOutputStream outputStream = new ByteArrayOutputStream(BUFFER_SIZE)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int remaining = MAX_CONTENT_SIZE;
            // Each read is bounded by the remaining budget, so the limit is never exceeded
            // and no chunk is read only to be thrown away once the limit has been reached.
            while (remaining > 0) {
                int num = inputStream.read(buffer, 0, Math.min(buffer.length, remaining));
                if (num == -1) {
                    break;
                }
                outputStream.write(buffer, 0, num);
                remaining -= num;
            }
            return outputStream.toByteArray();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
package org.geekhub;

import java.io.*;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
* ImageCrawler downloads all images to specified folder from specified resource.
Expand All @@ -13,8 +17,10 @@
*/
public class ImageCrawler {

private List<String> imageExtensions = Arrays.asList("jpg", "jpeg", "bmp", "gif", "png", "tiff", "tif");

//number of threads to download images simultaneously
public static final int NUMBER_OF_THREADS = 10;
public static final int NUMBER_OF_THREADS = 15;

private ExecutorService executorService = Executors.newFixedThreadPool(NUMBER_OF_THREADS);
private String folder;
Expand All @@ -29,7 +35,11 @@ public ImageCrawler(String folder) throws MalformedURLException {
* @throws IOException
*/
public void downloadImages(String urlToPage) throws IOException {
    //build the page for the given address and walk through the image links it reports
    Page page = new Page(new URL(urlToPage));
    for (URL link : page.getImageLinks()) {
        //only links that look like images are handed to the executor for download
        if (isImageURL(link)) {
            executorService.execute(new ImageTask(link, folder));
        }
    }
}

/**
Expand All @@ -39,12 +49,26 @@ public void stop() {
executorService.shutdown();
}

/**
 * Shuts the executor down and then waits for it to terminate, for at most one minute.
 * The result of the wait is not reported to the caller.
 *
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
public void awaitTermination() throws InterruptedException {
    final long timeout = 1;
    final TimeUnit unit = TimeUnit.MINUTES;

    executorService.shutdown();
    executorService.awaitTermination(timeout, unit);
}

//Detects whether the given URL points to an image by comparing its extension
//(case-insensitively) with the known image extensions.
private boolean isImageURL(URL url) {
    //getPath() carries no query string, so "/pic.jpg?size=2" is still recognised as "jpg";
    //getFile() is checked too, so URLs whose extension only shows up in the query part stay accepted
    return hasImageExtension(url.getPath()) || hasImageExtension(url.getFile());
}

//Returns true when the text after the last '.' of the given name is one of the image extensions.
private boolean hasImageExtension(String name) {
    int index = name.lastIndexOf('.');
    if (index <= 0) {
        //no dot at all, or nothing in front of the only dot
        return false;
    }
    String ext = name.substring(index + 1);
    return imageExtensions
            .stream()
            .anyMatch(item -> item.equalsIgnoreCase(ext));
}



}
15 changes: 13 additions & 2 deletions org/geekhub/ImageTask.java → src/org/geekhub/ImageTask.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
package org.geekhub;

import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
* Represents worker that downloads image from URL to specified folder.<br/>
* Name of the image will be constructed based on URL. Names for the same URL will be the same.
*/
public class ImageTask implements Runnable {

private URL url;
private String folder;

Expand All @@ -20,11 +25,17 @@ public ImageTask(URL url, String folder) {
*/
@Override
public void run() {
    Path path = Paths.get(folder, buildFileName(url));
    try {
        //the target folder may not exist yet; without it every write would fail
        Path parent = path.getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
        Files.write(path, ConnectionUtils.getData(url));
    } catch (IOException e) {
        //a failed download must not kill the worker thread; report which image failed and carry on
        System.err.println("Failed to download " + url + " to " + path);
        e.printStackTrace();
    }
}

//Builds a file name from the text of the URL: every character other than an ASCII letter,
//a digit, '-', '_' or '.' is replaced with '_'. The same URL always gives the same name.
private String buildFileName(URL url) {
    final String address = url.toString();
    final String sanitized = address.replaceAll("[^a-zA-Z0-9-_\\.]", "_");
    return sanitized;
}
}
}
18 changes: 17 additions & 1 deletion org/geekhub/Main.java → src/org/geekhub/Main.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
package org.geekhub;

import java.io.IOException;
import java.net.URL;
import java.util.Scanner;

public class Main {

public static final String FOLDER_TO_DOWNLOAD = "d:/images/";
public static final String FOLDER_TO_DOWNLOAD = "d:/temp/images/";

public static void main(String[] args) throws IOException {
ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD);
Expand All @@ -19,6 +20,21 @@ public static void main(String[] args) throws IOException {
imageCrawler.downloadImages(command);
System.out.println("...and another url:");
}

imageCrawler.stop();
}

/**
 * Downloads all images from the given page with a fresh crawler, waits for the crawler
 * to finish and prints the elapsed time in milliseconds.
 *
 * @param url address of the page to download images from
 * @throws IOException if the crawler fails while starting the download
 */
public void test(URL url) throws IOException {
    //nanoTime is monotonic, so the measurement is not affected by wall-clock adjustments
    long start = System.nanoTime();

    ImageCrawler imageCrawler = new ImageCrawler(FOLDER_TO_DOWNLOAD);
    imageCrawler.downloadImages(url.toString());

    try {
        imageCrawler.awaitTermination();
    } catch (InterruptedException e) {
        //restore the interrupt flag so the caller can still see that the thread was interrupted
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
    System.out.println("Time spent ms " + (System.nanoTime() - start) / 1_000_000);
}
}
8 changes: 5 additions & 3 deletions org/geekhub/Page.java → src/org/geekhub/Page.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand All @@ -13,14 +15,14 @@
*/
public class Page {
Pattern linkPattern = Pattern.compile("<a\\s(?:[^\\s>]*?\\s)*?href=\"(.*?)\".*?>");
Pattern imageLinkPattern = Pattern.compile("<img.*?src=\"(.*?)\".*?(/>|</img>)");
Pattern imageLinkPattern = Pattern.compile("<img.*?src=\"(.*?)\".*?(/>|</img>)", Pattern.DOTALL);

private String content;
private URL url;

/**
* Be careful, constructor downloads content, it could be slow.
* @param url
* @param url to page with links
* @throws IOException
*/
public Page(URL url) throws IOException {
Expand Down