-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathProgram.cs
More file actions
111 lines (98 loc) · 3.5 KB
/
Program.cs
File metadata and controls
111 lines (98 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Runtime.Remoting.Messaging;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
namespace PDFIndex
{
    /// <summary>
    /// Console tool that extracts the text of a PDF document and writes a word
    /// index (word -> page numbers on which it occurs) to a text file.
    /// </summary>
    internal class Program
    {
        /// <summary>Name of the index file; it is created in the current working directory.</summary>
        private const string OutputFileName = "word_index.txt";

        /// <summary>Tokens of this length or shorter are not indexed.</summary>
        private const int MaxIgnoredWordLength = 2;

        /// <summary>
        /// Entry point. Expects the path of the PDF to index as the first
        /// command-line argument, builds the index and writes one line per word
        /// to <c>word_index.txt</c> in the form <c>word&lt;TAB&gt;page, page, </c>.
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the PDF path.</param>
        private static void Main(string[] args)
        {
            string filename = GetValidatedPath(args);
            if (filename == null)
            {
                Console.WriteLine("Please provide a valid file path.");
                return;
            }

            Console.WriteLine("Building word index for \n{0} ...", filename);
            SortedDictionary<string, List<int>> wordIndex = BuildIndexFromPdf(filename);

            // StreamWriter buffers its output: without Dispose/Flush the tail of the
            // index (or all of it, for small indexes) may never reach the disk.
            using (var file = new System.IO.StreamWriter(OutputFileName))
            {
                foreach (KeyValuePair<string, List<int>> entry in wordIndex)
                {
                    // Every page number is followed by ", " (the last one included) so
                    // the file format stays identical to what earlier versions produced.
                    var pages = new StringBuilder();
                    foreach (int page in entry.Value)
                    {
                        pages.Append(page.ToString(CultureInfo.InvariantCulture)).Append(", ");
                    }

                    file.WriteLine("{0}\t{1}", entry.Key, pages);
                }
            }

            Console.WriteLine("Completed! nb of words: {0}", wordIndex.Count);
        }

        /// <summary>
        /// Returns the first command-line argument when it is a syntactically valid
        /// path to an existing file; otherwise returns <c>null</c>.
        /// </summary>
        /// <param name="args">The command-line arguments passed to <see cref="Main"/>.</param>
        /// <returns>The validated path, or <c>null</c> when no usable path was supplied.</returns>
        private static string GetValidatedPath(string[] args)
        {
            // No argument at all: indexing args[0] would throw IndexOutOfRangeException.
            if (args == null || args.Length == 0)
            {
                return null;
            }

            try
            {
                var info = new System.IO.FileInfo(args[0]);

                // Constructing a FileInfo only validates the path syntax; the file
                // itself must also exist before it is handed to the PDF reader.
                return info.Exists ? args[0] : null;
            }
            catch (ArgumentException)
            {
                // Empty, whitespace-only or otherwise malformed path.
                return null;
            }
            catch (System.IO.PathTooLongException)
            {
                return null;
            }
            catch (NotSupportedException)
            {
                // Path contains an unsupported construct (e.g. a misplaced colon).
                return null;
            }
        }

        /// <summary>
        /// Reads every page of the PDF at <paramref name="path"/> and records, for
        /// each indexable word (lower-cased), the pages on which it appears.
        /// Pages whose text extraction fails with an inline-image parse error are
        /// reported on the console and skipped.
        /// </summary>
        /// <param name="path">Path of the PDF document to index.</param>
        /// <returns>
        /// A dictionary sorted by word; each value lists the page numbers (1-based,
        /// ascending, without duplicates) on which the word occurs.
        /// </returns>
        private static SortedDictionary<string, List<int>> BuildIndexFromPdf(string path)
        {
            var wordIndex = new SortedDictionary<string, List<int>>();
            using (PdfReader reader = new PdfReader(path))
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    string pageText;
                    try
                    {
                        pageText = PdfTextExtractor.GetTextFromPage(reader, page);
                    }
                    catch (InlineImageUtils.InlineImageParseException e)
                    {
                        Console.WriteLine("Exception! {0}. ", e.Message);
                        Console.WriteLine("Skipping page {0} ...", page);
                        continue;
                    }

                    foreach (string word in Tokenize(pageText))
                    {
                        AddOccurrence(wordIndex, word, page);
                    }
                }
            }

            return wordIndex;
        }

        /// <summary>
        /// Splits page text into lower-cased, indexable words. Every character that
        /// is not a letter, a digit or a hyphen acts as a separator.
        /// </summary>
        /// <param name="text">Raw text of one page; may be <c>null</c> or empty.</param>
        /// <returns>The indexable words of the page, in reading order, duplicates included.</returns>
        private static IEnumerable<string> Tokenize(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return Enumerable.Empty<string>();
            }

            // Map every separator (punctuation AND all kinds of whitespace: tabs,
            // carriage returns, non-breaking spaces, ...) to a plain space so that a
            // single-character split is sufficient and no whitespace leaks into words.
            var cleaned = new StringBuilder(text.Length);
            foreach (char c in text)
            {
                bool keep = char.IsLetterOrDigit(c) || c == '-';
                cleaned.Append(keep ? c : ' ');
            }

            return cleaned.ToString()
                .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                .Where(t => IsIndexable(t))
                // Invariant casing: index keys must not depend on the machine's culture.
                .Select(t => t.ToLowerInvariant());
        }

        /// <summary>
        /// Decides whether a token belongs in the index: it must be longer than
        /// <see cref="MaxIgnoredWordLength"/>, contain at least one letter or digit
        /// (so runs of hyphens are rejected) and must not be a plain integer.
        /// </summary>
        /// <param name="token">A non-empty token made of letters, digits and hyphens.</param>
        /// <returns><c>true</c> when the token should be indexed.</returns>
        private static bool IsIndexable(string token)
        {
            return token.Length > MaxIgnoredWordLength
                && token.Any(c => char.IsLetterOrDigit(c))
                && !IsIntegerToken(token);
        }

        /// <summary>
        /// Tells whether a token is an integer literal: an optional leading '-'
        /// followed only by ASCII digits. Unlike <c>int.TryParse</c> this does not
        /// fail on values outside the Int32 range and does not depend on the culture.
        /// </summary>
        /// <param name="token">A non-empty token.</param>
        /// <returns><c>true</c> when the token consists solely of an integer.</returns>
        private static bool IsIntegerToken(string token)
        {
            int start = token[0] == '-' ? 1 : 0;
            if (start == token.Length)
            {
                return false;
            }

            for (int i = start; i < token.Length; i++)
            {
                if (token[i] < '0' || token[i] > '9')
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Records that <paramref name="word"/> occurs on <paramref name="page"/>.
        /// Must be called with non-decreasing page numbers for a given index.
        /// </summary>
        /// <param name="index">The index being built.</param>
        /// <param name="word">The (already normalized) word.</param>
        /// <param name="page">The 1-based page number on which the word was found.</param>
        private static void AddOccurrence(SortedDictionary<string, List<int>> index, string word, int page)
        {
            List<int> pages;
            if (!index.TryGetValue(word, out pages))
            {
                pages = new List<int>();
                index[word] = pages;
            }

            // Pages are processed in ascending order, so a duplicate can only be the
            // most recently added entry; no need to scan the whole list.
            if (pages.Count == 0 || pages[pages.Count - 1] != page)
            {
                pages.Add(page);
            }
        }
    }
}