From 6cc72a502abb81c8245b352f14600be194cba7aa Mon Sep 17 00:00:00 2001 From: Carlos Gomez <76704548+crgmz@users.noreply.github.com> Date: Mon, 20 Oct 2025 11:46:58 -0400 Subject: [PATCH] Add smaller duplicate check to google takeout --- adapters/googlePhotos/googlephotos.go | 67 +++++++++++++++++++++++++++ app/upload/ui.go | 7 +-- app/upload/upload.go | 2 +- internal/fileevent/fileevents.go | 8 +++- internal/journal/journal.go | 44 +++++++++--------- 5 files changed, 102 insertions(+), 26 deletions(-) diff --git a/adapters/googlePhotos/googlephotos.go b/adapters/googlePhotos/googlephotos.go index 537e4055..0475a9a5 100644 --- a/adapters/googlePhotos/googlephotos.go +++ b/adapters/googlePhotos/googlephotos.go @@ -3,8 +3,10 @@ package gp import ( "bytes" "context" + "fmt" "io/fs" "log/slog" + "math" "path" "path/filepath" "sort" @@ -352,9 +354,53 @@ func (toc *TakeoutCmd) passTwo(ctx context.Context, gOut chan *assets.Group) err // image *assetFile // } +func formatBytes(s int64) string { + suffixes := []string{"B", "KB", "MB", "GB"} + bytes := float64(s) + base := 1024.0 + if bytes < base { + return fmt.Sprintf("%.0f %s", bytes, suffixes[0]) + } + exp := int64(0) + for bytes >= base && exp < int64(len(suffixes)-1) { + bytes /= base + exp++ + } + roundedSize := math.Round(bytes*10) / 10 + return fmt.Sprintf("%.1f %s", roundedSize, suffixes[exp]) +} + +type FileInfo struct { + matches []struct { + extension string + size int64 + } + largestSize int64 +} + func (toc *TakeoutCmd) handleDir(ctx context.Context, dir string, gOut chan *assets.Group) error { catalog := toc.catalogs[dir] + fileSizeByExt := map[string]*FileInfo{} + for name, a := range catalog.matchedFiles { + fileName := strings.TrimSuffix(name, path.Ext(name)) + fileInfo, ok := fileSizeByExt[fileName] + if !ok { + fileInfo = &FileInfo{} + fileSizeByExt[fileName] = fileInfo + } + fileInfo.matches = append(fileInfo.matches, struct { + extension string + size int64 + }{ + extension: path.Ext(a.OriginalFileName), + size: int64(a.FileSize), + }) + if int64(a.FileSize) > fileInfo.largestSize { + fileInfo.largestSize = int64(a.FileSize) + } + } + dirEntries := make([]*assets.Asset, 0, len(catalog.matchedFiles)) // Filter and sort the files @@ -362,6 +408,27 @@ func (toc *TakeoutCmd) handleDir(ctx context.Context, dir string, gOut chan *ass a := catalog.matchedFiles[name] key := fileKeyTracker{baseName: name, size: int64(a.FileSize)} track, _ := toc.fileTracker.Load(key) // track := to.fileTracker[key] + + fileName := strings.TrimSuffix(name, path.Ext(name)) + fileInfo := fileSizeByExt[fileName] + if fileInfo != nil && int64(a.FileSize) < fileInfo.largestSize { + sortedMatches := make([]struct { + extension string + size int64 + }, len(fileInfo.matches)) + copy(sortedMatches, fileInfo.matches) + sort.Slice(sortedMatches, func(i, j int) bool { + return sortedMatches[i].size > sortedMatches[j].size + }) + + var matchesInfo []string + for _, m := range sortedMatches { + matchesInfo = append(matchesInfo, fmt.Sprintf("[%s-%s]", m.extension, formatBytes(m.size))) + } + + toc.logMessage(ctx, fileevent.AnalysisSmallerLocalDuplicate, a.File, strings.Join(matchesInfo, "; ")) + continue + } if track.status == fileevent.Uploaded { a.Close() toc.logMessage(ctx, fileevent.AnalysisLocalDuplicate, a.File, "local duplicate") diff --git a/app/upload/ui.go b/app/upload/ui.go index 45c3b53e..37c547d4 100644 --- a/app/upload/ui.go +++ b/app/upload/ui.go @@ -270,10 +270,11 @@ func (uc *UpCmd) newUI(ctx context.Context, a *app.Application) *uiPage { ui.addCounter(ui.prepareCounts, 3, "Discarded files", fileevent.DiscoveredDiscarded) ui.addCounter(ui.prepareCounts, 4, "Unsupported files", fileevent.DiscoveredUnsupported) ui.addCounter(ui.prepareCounts, 5, "Duplicates in the input", fileevent.AnalysisLocalDuplicate) - ui.addCounter(ui.prepareCounts, 6, "Files with a sidecar", fileevent.AnalysisAssociatedMetadata) - ui.addCounter(ui.prepareCounts, 7, "Files without sidecar", fileevent.AnalysisMissingAssociatedMetadata) + ui.addCounter(ui.prepareCounts, 6, "Smaller Duplicates", fileevent.AnalysisSmallerLocalDuplicate) + ui.addCounter(ui.prepareCounts, 7, "Files with a sidecar", fileevent.AnalysisAssociatedMetadata) + ui.addCounter(ui.prepareCounts, 8, "Files without sidecar", fileevent.AnalysisMissingAssociatedMetadata) - ui.prepareCounts.SetSize(8, 2, 1, 1).SetColumns(30, 10) + ui.prepareCounts.SetSize(9, 2, 1, 1).SetColumns(30, 10) ui.uploadCounts = tview.NewGrid() ui.uploadCounts.SetBorder(true).SetTitle("Uploading") diff --git a/app/upload/upload.go b/app/upload/upload.go index 4142e312..4e117286 100644 --- a/app/upload/upload.go +++ b/app/upload/upload.go @@ -132,7 +132,7 @@ func NewUploadCommand(ctx context.Context, app *app.Application) *cobra.Command // Run is called back by the actual asset reader func (uc *UpCmd) Run(cmd *cobra.Command, adapter adapters.Reader) error { - uc.Mode = UpModeFolder // TODO + uc.Mode = UpModeGoogleTakeout // TODO // ready to run ctx := cmd.Context() diff --git a/internal/fileevent/fileevents.go b/internal/fileevent/fileevents.go index ff464a3e..11fe4506 100644 --- a/internal/fileevent/fileevents.go +++ b/internal/fileevent/fileevents.go @@ -34,6 +34,7 @@ const ( AnalysisAssociatedMetadata AnalysisMissingAssociatedMetadata AnalysisLocalDuplicate + AnalysisSmallerLocalDuplicate UploadNotSelected UploadUpgraded // = "Server's asset upgraded" @@ -70,6 +71,7 @@ var _code = map[Code]string{ AnalysisAssociatedMetadata: "associated metadata file", AnalysisMissingAssociatedMetadata: "missing associated metadata file", AnalysisLocalDuplicate: "file duplicated in the input", + AnalysisSmallerLocalDuplicate: "smaller duplicate", UploadNotSelected: "file not selected", UploadUpgraded: "server's asset upgraded with the input", @@ -100,6 +102,7 @@ var _logLevels = map[Code]slog.Level{ AnalysisAssociatedMetadata: slog.LevelInfo, AnalysisMissingAssociatedMetadata: slog.LevelWarn, AnalysisLocalDuplicate: slog.LevelWarn, + AnalysisSmallerLocalDuplicate: slog.LevelWarn, UploadNotSelected: slog.LevelWarn, UploadUpgraded: slog.LevelInfo, UploadServerBetter: slog.LevelInfo, @@ -178,6 +181,7 @@ func (r *Recorder) Report() string { DiscoveredDiscarded, DiscoveredUnsupported, AnalysisLocalDuplicate, + AnalysisSmallerLocalDuplicate, AnalysisAssociatedMetadata, AnalysisMissingAssociatedMetadata, } { @@ -195,6 +199,7 @@ func (r *Recorder) Report() string { DiscoveredDiscarded, DiscoveredUnsupported, AnalysisLocalDuplicate, + AnalysisSmallerLocalDuplicate, AnalysisAssociatedMetadata, AnalysisMissingAssociatedMetadata, } { @@ -258,7 +263,8 @@ func (r *Recorder) TotalProcessed(forcedMissingJSON bool) int64 { atomic.LoadInt64(&r.counts[UploadServerDuplicate]) + atomic.LoadInt64(&r.counts[UploadServerBetter]) + atomic.LoadInt64(&r.counts[DiscoveredDiscarded]) + - atomic.LoadInt64(&r.counts[AnalysisLocalDuplicate]) + atomic.LoadInt64(&r.counts[AnalysisLocalDuplicate]) + + atomic.LoadInt64(&r.counts[AnalysisSmallerLocalDuplicate]) if !forcedMissingJSON { v += atomic.LoadInt64(&r.counts[AnalysisMissingAssociatedMetadata]) } diff --git a/internal/journal/journal.go b/internal/journal/journal.go index 2c916950..f9dc58d2 100644 --- a/internal/journal/journal.go +++ b/internal/journal/journal.go @@ -14,26 +14,27 @@ type Journal struct { type Action string const ( - DiscoveredFile Action = "File" - ScannedImage Action = "Scanned image" - ScannedVideo Action = "Scanned video" - Discarded Action = "Discarded" - Uploaded Action = "Uploaded" - Upgraded Action = "Server's asset upgraded" - ERROR Action = "Error" - LocalDuplicate Action = "Local duplicate" - ServerDuplicate Action = "Server has photo" - Stacked Action = "Stacked" - ServerBetter Action = "Server's asset is better" - Album Action = "Added to an album" - LivePhoto Action = "Live photo" - FailedVideo Action = "Failed video" - Unsupported Action = "File type not supported" - Metadata Action = "Metadata files" - AssociatedMetadata Action = "Associated with metadata" - INFO Action = "Info" - NotSelected Action = "Not selected because options" - ServerError Action = "Server error" + DiscoveredFile Action = "File" + ScannedImage Action = "Scanned image" + ScannedVideo Action = "Scanned video" + Discarded Action = "Discarded" + Uploaded Action = "Uploaded" + Upgraded Action = "Server's asset upgraded" + ERROR Action = "Error" + LocalDuplicate Action = "Local duplicate" + SmallerLocalDuplicate Action = "Smaller Local duplicate" + ServerDuplicate Action = "Server has photo" + Stacked Action = "Stacked" + ServerBetter Action = "Server's asset is better" + Album Action = "Added to an album" + LivePhoto Action = "Live photo" + FailedVideo Action = "Failed video" + Unsupported Action = "File type not supported" + Metadata Action = "Metadata files" + AssociatedMetadata Action = "Associated with metadata" + INFO Action = "Info" + NotSelected Action = "Not selected because options" + ServerError Action = "Server error" ) func NewJournal(log Logger) *Journal { @@ -71,7 +72,7 @@ func (j *Journal) AddEntry(file string, action Action, comment ...string) { func (j *Journal) Report() { checkFiles := j.counts[ScannedImage] + j.counts[ScannedVideo] + j.counts[Metadata] + j.counts[Unsupported] + j.counts[FailedVideo] + j.counts[Discarded] - handledFiles := j.counts[NotSelected] + j.counts[LocalDuplicate] + j.counts[ServerDuplicate] + j.counts[ServerBetter] + j.counts[Uploaded] + j.counts[Upgraded] + j.counts[ServerError] + handledFiles := j.counts[NotSelected] + j.counts[LocalDuplicate] + j.counts[SmallerLocalDuplicate] + j.counts[ServerDuplicate] + j.counts[ServerBetter] + j.counts[Uploaded] + j.counts[Upgraded] + j.counts[ServerError] j.Log.OK("Scan of the sources:") j.Log.OK("%6d files in the input", j.counts[DiscoveredFile]) j.Log.OK("--------------------------------------------------------") @@ -91,6 +92,7 @@ func (j *Journal) Report() { j.Log.OK("%6d files already on the server", j.counts[ServerDuplicate]) j.Log.OK("%6d discarded files because of options", j.counts[NotSelected]) j.Log.OK("%6d discarded files because duplicated in the input", j.counts[LocalDuplicate]) + j.Log.OK("%6d discarded files because smaller duplicate", j.counts[SmallerLocalDuplicate]) j.Log.OK("%6d discarded files because server has a better image", j.counts[ServerBetter]) j.Log.OK("%6d errors when uploading", j.counts[ServerError])