stats-import: Custom edit to import only word frequency

This commit is contained in:
Hamcha 2016-02-14 10:33:28 +01:00
parent 7211de403a
commit 942651dacb

View file

@ -7,8 +7,7 @@ import (
"flag" "flag"
"log" "log"
"os" "os"
"strconv" "strings"
"time"
"github.com/boltdb/bolt" "github.com/boltdb/bolt"
) )
@ -27,6 +26,8 @@ type Message struct {
Date int `json:"date"` Date int `json:"date"`
} }
type UserCount map[string]uint64
type Stats struct { type Stats struct {
Total uint64 Total uint64
ByUser map[string]uint64 ByUser map[string]uint64
@ -36,6 +37,7 @@ type Stats struct {
Replies uint64 Replies uint64
Forward uint64 Forward uint64
Username map[string]string Username map[string]string
Words map[string]UserCount
} }
func assert(err error) { func assert(err error) {
@ -84,35 +86,63 @@ func main() {
} }
func processMessage(msg Message, data *Stats) { func processMessage(msg Message, data *Stats) {
data.Total++ /*
data.Total++
if msg.ReplyID != nil { if msg.ReplyID != nil {
data.Replies++ data.Replies++
}
if msg.FwdUser != nil {
data.Forward++
}
date := time.Unix(int64(msg.Date), 0)
data.ByHour[date.Hour()]++
data.ByWeekday[date.Weekday()]++
val, exists := data.ByUser[msg.From.Username]
if !exists {
val = 0
}
data.ByUser[msg.From.Username] = val + 1
datekey := date.Format("2006-1-2")
val, exists = data.ByDate[datekey]
if !exists {
val = 0
}
data.ByDate[datekey] = val + 1
data.Username[msg.From.Username] = msg.From.FirstName
*/
if len(msg.Text) > 2 {
wordList := strings.Split(msg.Text, " ")
for _, word := range wordList {
if len(word) < 3 {
continue
}
word = strings.ToLower(word)
if strings.HasPrefix(word, "http") {
continue
}
word = strings.Trim(word, " ?!.,:;/-_()[]{}'\"+=*^\n")
count, ok := data.Words[word]
if !ok {
count = make(UserCount)
}
val, ok := count[msg.From.Username]
if !ok {
val = 0
}
count[msg.From.Username] = val + 1
data.Words[word] = count
}
} }
if msg.FwdUser != nil {
data.Forward++
}
date := time.Unix(int64(msg.Date), 0)
data.ByHour[date.Hour()]++
data.ByWeekday[date.Weekday()]++
val, exists := data.ByUser[msg.From.Username]
if !exists {
val = 0
}
data.ByUser[msg.From.Username] = val + 1
datekey := date.Format("2006-1-2")
val, exists = data.ByDate[datekey]
if !exists {
val = 0
}
data.ByDate[datekey] = val + 1
data.Username[msg.From.Username] = msg.From.FirstName
} }
func MakeUint(bval []byte, bucketName string, key string) uint64 { func MakeUint(bval []byte, bucketName string, key string) uint64 {
@ -138,114 +168,150 @@ func PutUint(value uint64) []byte {
func update(db *bolt.DB, data Stats) error { func update(db *bolt.DB, data Stats) error {
return db.Update(func(tx *bolt.Tx) error { return db.Update(func(tx *bolt.Tx) error {
b, err := tx.CreateBucketIfNotExists([]byte("global")) /*
if err != nil { b, err := tx.CreateBucketIfNotExists([]byte("global"))
return err
}
// Update total
total := MakeUint(b.Get([]byte("count")), "global", "count")
total += data.Total
err = b.Put([]byte("count"), PutUint(total))
if err != nil {
return err
}
// Update replies
replies := MakeUint(b.Get([]byte("replies")), "global", "replies")
replies += data.Replies
err = b.Put([]byte("replies"), PutUint(total))
if err != nil {
return err
}
// Update forward
forward := MakeUint(b.Get([]byte("forward")), "global", "forward")
forward += data.Forward
err = b.Put([]byte("forward"), PutUint(total))
if err != nil {
return err
}
// Update hour counters
b, err = tx.CreateBucketIfNotExists([]byte("hour"))
if err != nil {
return err
}
for i := 0; i < 24; i++ {
curhour := MakeUint(b.Get([]byte{byte(i)}), "hour", strconv.Itoa(i))
curhour += data.ByHour[i]
err = b.Put([]byte{byte(i)}, PutUint(curhour))
if err != nil { if err != nil {
return err return err
} }
}
// Update weekday counters // Update total
b, err = tx.CreateBucketIfNotExists([]byte("weekday")) total := MakeUint(b.Get([]byte("count")), "global", "count")
if err != nil { total += data.Total
return err err = b.Put([]byte("count"), PutUint(total))
}
for i := 0; i < 7; i++ {
curwday := MakeUint(b.Get([]byte{byte(i)}), "weekday", strconv.Itoa(i))
curwday += data.ByWeekday[i]
err = b.Put([]byte{byte(i)}, PutUint(curwday))
if err != nil { if err != nil {
return err return err
} }
}
// Update date counters // Update replies
b, err = tx.CreateBucketIfNotExists([]byte("date")) replies := MakeUint(b.Get([]byte("replies")), "global", "replies")
if err != nil { replies += data.Replies
return err err = b.Put([]byte("replies"), PutUint(total))
}
for day, count := range data.ByDate {
count += MakeUint(b.Get([]byte(day)), "date", day)
err = b.Put([]byte(day), PutUint(count))
if err != nil { if err != nil {
return err return err
} }
}
// Update user counters // Update forward
b, err = tx.CreateBucketIfNotExists([]byte("users-count")) forward := MakeUint(b.Get([]byte("forward")), "global", "forward")
if err != nil { forward += data.Forward
return err err = b.Put([]byte("forward"), PutUint(total))
}
for user, count := range data.ByUser {
// Why do I even need this?
if len(user) < 1 {
continue
}
count += MakeUint(b.Get([]byte(user)), "users-count", user)
err = b.Put([]byte(user), PutUint(count))
if err != nil { if err != nil {
return err return err
} }
}
// Add to username table exclusively if not already present // Update hour counters
b, err = tx.CreateBucketIfNotExists([]byte("usernames")) b, err = tx.CreateBucketIfNotExists([]byte("hour"))
if err != nil { if err != nil {
return err return err
}
for user, first := range data.Username {
// Why do I even need this? (2)
if len(user) < 1 {
continue
} }
val := b.Get([]byte(user))
if val == nil { for i := 0; i < 24; i++ {
err = b.Put([]byte(user), []byte(first)) curhour := MakeUint(b.Get([]byte{byte(i)}), "hour", strconv.Itoa(i))
curhour += data.ByHour[i]
err = b.Put([]byte{byte(i)}, PutUint(curhour))
if err != nil { if err != nil {
return err return err
} }
} }
// Update weekday counters
b, err = tx.CreateBucketIfNotExists([]byte("weekday"))
if err != nil {
return err
}
for i := 0; i < 7; i++ {
curwday := MakeUint(b.Get([]byte{byte(i)}), "weekday", strconv.Itoa(i))
curwday += data.ByWeekday[i]
err = b.Put([]byte{byte(i)}, PutUint(curwday))
if err != nil {
return err
}
}
// Update date counters
b, err = tx.CreateBucketIfNotExists([]byte("date"))
if err != nil {
return err
}
for day, count := range data.ByDate {
count += MakeUint(b.Get([]byte(day)), "date", day)
err = b.Put([]byte(day), PutUint(count))
if err != nil {
return err
}
}
// Update user counters
b, err = tx.CreateBucketIfNotExists([]byte("users-count"))
if err != nil {
return err
}
for user, count := range data.ByUser {
// Why do I even need this?
if len(user) < 1 {
continue
}
count += MakeUint(b.Get([]byte(user)), "users-count", user)
err = b.Put([]byte(user), PutUint(count))
if err != nil {
return err
}
}
// Add to username table exclusively if not already present
b, err = tx.CreateBucketIfNotExists([]byte("usernames"))
if err != nil {
return err
}
for user, first := range data.Username {
// Why do I even need this? (2)
if len(user) < 1 {
continue
}
val := b.Get([]byte(user))
if val == nil {
err = b.Put([]byte(user), []byte(first))
if err != nil {
return err
}
}
}
*/
// Add word frequency
b, err := tx.CreateBucketIfNotExists([]byte("words"))
if err != nil {
return err
}
for word, freq := range data.Words {
// Sanity check, you never know!
if len(word) < 1 {
continue
}
var count UserCount
val := b.Get([]byte(word))
if val == nil {
// No need to add, just apply the current count
count = freq
} else {
// Deserialize counter and add each user one by one
err = json.Unmarshal(val, &count)
if err != nil {
return err
}
for user, wcount := range freq {
count[user] += wcount
}
}
bval, err := json.Marshal(count)
if err != nil {
return err
}
err = b.Put([]byte(word), bval)
if err != nil {
return err
}
} }
return nil return nil