diff --git a/.gitignore b/.gitignore index 855e8a9..b066d45 100644 --- a/.gitignore +++ b/.gitignore @@ -547,8 +547,7 @@ MigrationBackup/ # Ionide (cross platform F# VS Code tools) working folder .ionide/ -# Ignore .FIT & .GPX files -*.fit -*.gpx +# Ignore testing files +files/* # End of https://www.toptal.com/developers/gitignore/api/linux,macos,windows,jetbrains+all,go,visualstudio,visualstudiocode diff --git a/decode/decoding.go b/decode/decoding.go index 0daa1a7..f76ec21 100644 --- a/decode/decoding.go +++ b/decode/decoding.go @@ -1,15 +1,21 @@ package decode import ( + "bufio" "bytes" + "crypto/sha512" + "encoding/hex" "fmt" "io/ioutil" "math" "os" "strconv" + "strings" "time" + "github.com/PuerkitoBio/goquery" "github.com/bikedataproject/go-bike-data-lib/dbmodel" + "github.com/google/uuid" geo "github.com/paulmach/go.geo" "github.com/tkrajina/gpxgo/gpx" "github.com/tormoder/fit" @@ -154,3 +160,53 @@ func GpxToContribution(filedir string) (contrib dbmodel.Contribution, err error) return } + +// GetUserFromHTML : fill usr with the location-history defaults and set its ProviderUser to the hex-encoded SHA-512 of the e-mail address found in the .header_title element of the HTML-file +func GetUserFromHTML(filepath string, usr *dbmodel.User) (err error) { + // Set global user data + usr.UserIdentifier = uuid.New().String() + usr.ExpiresAt = -1 + usr.ExpiresIn = -1 + usr.IsHistoryFetched = true + usr.Provider = "web/LocationHistory" + usr.TokenCreationDate = time.Now() + usr.AccessToken = "0" + usr.RefreshToken = "0" + + // Open HTML file + file, err := os.Open(filepath) + if err != nil { + return + } + // Release the file handle on every return path + defer file.Close() + + // Create a buffer reader from the file + reader := bufio.NewReader(file) + + // Create goquery documentreader + doc, err := goquery.NewDocumentFromReader(reader) + if err != nil { + return + } + + // Find e-mail address in document + // Find the header element first + doc.Find(".header_title").Each(func(i int, s *goquery.Selection) { + // Split the value of this element by spaces + pageTitle := strings.Split(s.Text(), " ") + // Loop over each word + for _, word := range pageTitle { + // Find
the e-mail address + if strings.Contains(word, "@") { + // Hash the e-mail + hasher := sha512.New() + hasher.Write([]byte(word)) + usr.ProviderUser = hex.EncodeToString(hasher.Sum(nil)) + break + } + } + }) + + return +} diff --git a/filehandler.go b/fit-file-handler.go similarity index 98% rename from filehandler.go rename to fit-file-handler.go index ec437e6..d67be21 100644 --- a/filehandler.go +++ b/fit-file-handler.go @@ -3,7 +3,6 @@ package main import ( "fmt" "go-file-processing-daemon/decode" - "os" "time" "github.com/bikedataproject/go-bike-data-lib/dbmodel" @@ -50,7 +49,6 @@ func HandleFitFile(file string) error { return fmt.Errorf("Could not create contribution: %v", err) } log.Infof("Added contribution for user %v", userID) - os.Remove(file) return nil } @@ -87,6 +85,5 @@ func HandleGpxFile(file string) error { return fmt.Errorf("Could not create contribution: %v", err) } log.Infof("Added contribution for user %v", user.ID) - os.Remove(file) return nil } diff --git a/go.mod b/go.mod index fca5f74..f0e789e 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,8 @@ module go-file-processing-daemon go 1.13 require ( - github.com/bikedataproject/go-bike-data-lib/dbmodel v0.0.0-20200727162450-a47d3b297b9b + github.com/PuerkitoBio/goquery v1.5.1 + github.com/bikedataproject/go-bike-data-lib/dbmodel v0.0.0-20200728150720-09b74d41943c github.com/fatih/camelcase v1.0.0 // indirect github.com/fatih/structs v1.1.0 // indirect github.com/google/uuid v1.1.1 diff --git a/go.sum b/go.sum index 3c8541a..451a5a8 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,12 @@ github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/OneOfOne/xxhash v1.2.5 h1:zl/OfRA6nftbBK9qTohYBJ5xvw6C/oNKizR7cZGl3cI= github.com/OneOfOne/xxhash v1.2.5/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q= -github.com/bikedataproject/go-bike-data-lib/dbmodel 
v0.0.0-20200727162450-a47d3b297b9b h1:g+zqEaYpgJKUBd2fhtzcGi8PqOqnhA7m2oibnWDkhSg= -github.com/bikedataproject/go-bike-data-lib/dbmodel v0.0.0-20200727162450-a47d3b297b9b/go.mod h1:puaYhkBYtfO+uSfgHater2N6t4BAeGnNqmGs0G1rifM= +github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/bikedataproject/go-bike-data-lib/dbmodel v0.0.0-20200728150720-09b74d41943c h1:43jHCoAqjXn/iAU6piEy++6HF3hT7juI60v+q8f6DXg= +github.com/bikedataproject/go-bike-data-lib/dbmodel v0.0.0-20200728150720-09b74d41943c/go.mod h1:puaYhkBYtfO+uSfgHater2N6t4BAeGnNqmGs0G1rifM= github.com/bradfitz/latlong v0.0.0-20170410180902-f3db6d0dff40/go.mod h1:ZcXX9BndVQx6Q/JM6B8x7dLE9sl20S+TQsv4KO7tEQk= github.com/cespare/xxhash v1.0.0 h1:naDmySfoNg0nKS62/ujM6e71ZgM2AoVdaqGwMG0w18A= github.com/cespare/xxhash v1.0.0/go.mod h1:fX/lfQBkSCDXZSUgv6jVIu/EVA3/JNseAX5asI4c4T4= @@ -61,7 +65,11 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/image v0.0.0-20190501045829-6d32002ffd75/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/lint v0.0.0-20190409202823-959b441ac422 h1:QzoH/1pFpZguR8NrRHLcO6jKqfv2zpuSqZLgdm7ZmjI= golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a h1:oWX7TPOiFAMXLq8o0ikBYfCJVlRHBcsciT5bXOrH628= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= 
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190422165155-953cdadca894 h1:Cz4ceDQGXuKRnVBDTS23GTn/pU5OE2C0WrNTOYK1Uuc= diff --git a/location-history-handler.go b/location-history-handler.go new file mode 100644 index 0000000..c45cb61 --- /dev/null +++ b/location-history-handler.go @@ -0,0 +1,296 @@ +package main + +import ( + "archive/zip" + "bufio" + "crypto/sha1" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/bikedataproject/go-bike-data-lib/dbmodel" + "github.com/google/uuid" + geo "github.com/paulmach/go.geo" + log "github.com/sirupsen/logrus" +) + +// PointActivity : Single activity information object +type PointActivity struct { + Type string `json:"type"` + Confidence int `json:"confidence"` +} + +// PointActivities : Collection of activities +type PointActivities struct { + TimestampMs string `json:"timestampMs"` + Activity []PointActivity `json:"activity"` +} + +// LocationHistoryPoint : Single location datapoint +type LocationHistoryPoint struct { + TimestampMs string `json:"timestampMs"` + LatitudeE7 float64 `json:"latitudeE7"` + LongitudeE7 float64 `json:"longitudeE7"` + Accuracy int `json:"accuracy"` + Activity []PointActivities `json:"activity,omitempty"` +} + +// LocationHistory : Collection of LocationHistoryPoints +type LocationHistory struct { + Locations []LocationHistoryPoint `json:"locations"` +} + +// HandleLocationFile : Parse a given JSON file and process it's contents +func HandleLocationFile(filepath string, user dbmodel.User) error { + // Attempt to read the file + data, err := ioutil.ReadFile(filepath) + if err != 
nil { + return err + } + + // Unmarshal file + var history LocationHistory + if err = json.Unmarshal(data, &history); err != nil { + return fmt.Errorf("Could not unmarshall data into location history: %v", err) + } + + if len(history.Locations) > 0 { + // Convert history to trip-based objects + trips := make(map[string][]LocationHistoryPoint) + + // Loop over each individual point & organise per day + for _, point := range history.Locations { + unixMs, err := strconv.ParseInt(point.TimestampMs, 10, 64) + if err != nil { + return err + } + timestamp := time.Unix(unixMs/1000, 0) + + // Loop over the activities for each point; the label lets the first match leave both loops so the point is stored only once + activities: + for _, actCollection := range point.Activity { + for _, act := range actCollection.Activity { + if act.Type == LocationHistoryCylcingType && act.Confidence >= LocationHistoryActivityThreshold { + // Set trip + trips[timestamp.Format("2006-01-02")] = append(trips[timestamp.Format("2006-01-02")], point) + break activities + } + } + } + } + + // Convert map to Contributions + contributions, err := tripsToContributions(trips) + if err != nil { + return err + } + + // Upload data to database + for _, contribution := range contributions { + if err := db.AddContribution(&contribution, &user); err != nil { + log.Warnf("Could not add contribution to database: %v", err) + } else { + log.Info("Add location history trip to database") + } + } + + } else { + return fmt.Errorf("%v is not a location history file or is empty", filepath) + } + + // Clean return + return nil +} + +// UnpackLocationFiles : Unzip a given .ZIP file into a new uniquely named folder below extractPath and return the extracted .json/.html paths together with that folder +func UnpackLocationFiles(filepath string, extractPath string) (locationfiles []string, foldername string, err error) { + // Unzip & get all filenames + foldername = fmt.Sprintf("%v/%v", extractPath, uuid.New()) + files, err := unzip(filepath, foldername) + if err != nil { + return + } + + // Search for the location history files + for _, file := range files { + if strings.Contains(file, ".json") || strings.Contains(file, ".html") { +
locationfiles = append(locationfiles, file) + } + } + return +} + +// tripsToContributions : Convert location history trips to bikedataproject Contributions; trips with fewer points than LocationHistoryPointThreshold are skipped +func tripsToContributions(trips map[string][]LocationHistoryPoint) (contributions []dbmodel.Contribution, err error) { + for _, trip := range trips { + // Check if trip contains at least as many points as the threshold + if len(trip) >= LocationHistoryPointThreshold { + // Create geopath from points + geoPath := geo.NewPath() + var timestamps []time.Time + + for _, point := range trip { + // Add geopoint to path + geoPath.Push(geo.NewPoint(point.LongitudeE7/1e7, point.LatitudeE7/1e7)) + + // Get point timestamp + unixMs, err := strconv.ParseInt(point.TimestampMs, 10, 64) + if err != nil { + return contributions, err + } + // Convert to UNIX timestamp + ts := time.Unix(unixMs/1000, 0) + timestamps = append(timestamps, ts) + } + + // Create contribution + contrib := dbmodel.Contribution{ + UserAgent: "web/LocationHistory", + TimeStampStart: getStartTimestamp(trip), + TimeStampStop: getEndTimestamp(trip), + Distance: int(geoPath.GeoDistance()), + Duration: int(getEndTimestamp(trip).Sub(getStartTimestamp(trip)).Seconds()), + PointsGeom: geoPath, + PointsTime: timestamps, + } + + // Add contribution to array + contributions = append(contributions, contrib) + } + } + return +} + +// getStartTimestamp : get the lowest timestamp of an array of LocationHistoryPoints +func getStartTimestamp(points []LocationHistoryPoint) (timestamp time.Time) { + // Set timestamp to now + timestamp = time.Now() + + // Loop over trip points + for _, p := range points { + if tmpTimestamp, err := getTimestamp(p); err == nil { + // Check if timestamp is earlier + if diff := timestamp.Sub(tmpTimestamp); diff > 0 { + timestamp = tmpTimestamp + } + } + } + return +} + +// getEndTimestamp : get the highest timestamp of an array of LocationHistoryPoints +func getEndTimestamp(points []LocationHistoryPoint) (timestamp time.Time) { + // Loop over trip points
+ for _, p := range points { + if tmpTimestamp, err := getTimestamp(p); err == nil { + // Check if timestamp is later + if diff := timestamp.Sub(tmpTimestamp); diff < 0 { + timestamp = tmpTimestamp + } + } + } + return +} + +// getTimestamp : Get the timestamp of a single LocationHistoryPoint +func getTimestamp(point LocationHistoryPoint) (timestamp time.Time, err error) { + unixMs, err := strconv.ParseInt(point.TimestampMs, 10, 64) + if err != nil { + return + } + + // Convert milliseconds to a time.Time (the sub-second part is dropped) + timestamp = time.Unix(unixMs/1000, 0) + return +} + +// getProviderUser : Read HTML-file and return the hex-encoded SHA1 of the e-mail address found in its .header_title element +func getProviderUser(filepath string) (id string, err error) { + // Read file + file, err := os.Open(filepath) + if err != nil { + return + } + // Release the file handle on every return path + defer file.Close() + + // Convert to bytesreader + reader := bufio.NewReader(file) + + // Convert to Goquery object + doc, err := goquery.NewDocumentFromReader(reader) + if err != nil { + // doc is nil when parsing fails, so it must not be used below + return + } + + // Find page title & Loop over results - should be just 1 + doc.Find(".header_title").Each(func(i int, s *goquery.Selection) { + // Split sentence in words + pageTitle := strings.Split(s.Text(), " ") + for _, word := range pageTitle { + // Extract e-mail address + if strings.Contains(word, "@") { + // Hash with SHA1 + hasher := sha1.New() + hasher.Write([]byte(word)) + id = hex.EncodeToString(hasher.Sum(nil)) + break + } + } + }) + + return +} + +// unzip : unzip a given .zip file into destination and return the paths of the extracted entries +func unzip(source string, destination string) (result []string, err error) { + var filenames []string + + reader, err := zip.OpenReader(source) + if err != nil { + return filenames, err + } + defer reader.Close() + + for _, f := range reader.File { + // Store filename/path for returning and using later on + path := filepath.Join(destination, f.Name) + + // Refuse entries such as "../x" that would be written outside of destination + if !strings.HasPrefix(path, filepath.Clean(destination)+string(os.PathSeparator)) { + return result, fmt.Errorf("illegal file path in archive: %v", f.Name) + } + + // Add filename to result + result = append(result, path) + + if f.FileInfo().IsDir() { + // Make Folder + if err = os.MkdirAll(path, os.ModePerm); err != nil { + return result, err + } + continue + } + + // Copy file & contents + if err =
os.MkdirAll(filepath.Dir(path), os.ModePerm); err != nil { + return result, err + } + + outFile, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode()) + if err != nil { + return result, err + } + + rc, err := f.Open() + if err != nil { + // The output file is already open at this point; close it before bailing out + outFile.Close() + return result, err + } + + _, err = io.Copy(outFile, rc) + + // Close the file without defer to close before next iteration of loop + outFile.Close() + rc.Close() + + if err != nil { + // Return the paths collected so far, like every other error path in this loop + return result, err + } + } + return +} diff --git a/main.go b/main.go index bdeb6bd..95d1c95 100644 --- a/main.go +++ b/main.go @@ -1,12 +1,13 @@ package main import ( - "fmt" "go-file-processing-daemon/config" "go-file-processing-daemon/crawl" + "go-file-processing-daemon/decode" "io/ioutil" "os" "strconv" + "strings" "time" "github.com/bikedataproject/go-bike-data-lib/dbmodel" @@ -16,6 +17,15 @@ import ( var db dbmodel.Database +const ( + // LocationHistoryActivityThreshold : Threshold to validate the activity confidence against + LocationHistoryActivityThreshold = 40 + // LocationHistoryCylcingType : Type of activity which matches bike riding + LocationHistoryCylcingType = "ON_BICYCLE" + // LocationHistoryPointThreshold : Threshold of minimal data points + LocationHistoryPointThreshold = 20 +) + // ReadSecret : Read a file and return it's content as string - used for Docker secrets func ReadSecret(file string) string { data, err := ioutil.ReadFile(file) @@ -27,14 +37,14 @@ func ReadSecret(file string) string { func main() { // Set filetypes - FileTypes := [2]string{"fit", "gpx"} + FileTypes := []string{"fit", "gpx", "zip"} // Set logging to file - logfile, err := os.OpenFile(fmt.Sprintf("log/%v.log", time.Now().Unix()), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) + /*logfile, err := os.OpenFile(fmt.Sprintf("log/%v.log", time.Now().Unix()), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { log.Fatalf("Could not create logfile: %v", err) } - log.SetOutput(logfile) + log.SetOutput(logfile)*/ // Load configuration values conf
:= &config.Config{} @@ -91,10 +101,56 @@ func main() { log.Errorf("Something went wrong handling a GPX file: %v", err) } break + case "zip": + // Generate user object + var user dbmodel.User + + // Attempt to unzip the file + if locationfiles, _, err := UnpackLocationFiles(file, conf.FileDir); err != nil { + log.Errorf("Could not unzip %v: %v", file, err) + } else { + // Search the HTML-file to build a user account + for _, file := range locationfiles { + if strings.Contains(file, ".html") { + err = decode.GetUserFromHTML(file, &user) + if err != nil { + // Make blank user object + log.Errorf("Could not extract user from HTML file: %v", err) + } else { + // Check if user exists + userTmp, err := db.GetUserData(user.ProviderUser) + if err != nil { + log.Infof("Could not fetch user data: %v", err) + } + // Check if userdata is empty + if userTmp.ID == "" { + // Add user to database + user, err = db.AddUser(&user) + if err != nil { + log.Errorf("Could not add new user to database: %v", err) + } else { + log.Info("Created new user from HTML file") + } + } else { + user = userTmp + } + } + } + } + + // Handle the ZIP file contents which are .json files + for _, locationfile := range locationfiles { + if err := HandleLocationFile(locationfile, user); err != nil { + log.Warnf("Could not handle location file: %v", err) + } + } + } + break default: log.Warnf("Trying to handle a file which is not in filetypes? (%v)", file) break } + os.Remove(file) } } }