diff --git a/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 b/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 index 18cd0cb..775e113 100644 --- a/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 +++ b/PSOneTools/2.4/Find-PSOneDuplicateFileFast.ps1 @@ -1,245 +1,263 @@ -function Find-PSOneDuplicateFileFast -{ - <# - .SYNOPSIS - Identifies files with duplicate content and uses a partial hash for large files to speed calculation up +function Find-PSOneDuplicateFileFast { + <# + .SYNOPSIS + Identifies files with duplicate content and uses a partial hash for large files to speed calculation up - .DESCRIPTION - Returns a hashtable with the hashes that have at least two files (duplicates). Large files with partial hashes are suffixed with a "P". - Large files with a partial hash can be falsely positive: they may in fact be different even though the partial hash is the same - You either need to calculate the full hash for these files to be absolutely sure, or add -TestPartialHash. - Calculating a full hash for large files may take a very long time though. So you may be better off using other - strategies to identify duplicate file content, i.e. look at identical creation times, etc. + .DESCRIPTION + Returns a hashtable with the hashes that have at least two files (duplicates). Large files with partial hashes are suffixed with a "P". + Large files with a partial hash can be falsely positive: they may in fact be different even though the partial hash is the same + You either need to calculate the full hash for these files to be absolutely sure, or add -TestPartialHash. + Calculating a full hash for large files may take a very long time though. So you may be better off using other + strategies to identify duplicate file content, i.e. look at identical creation times, etc. - .EXAMPLE - $Path = [Environment]::GetFolderPath('MyDocuments') - Find-PSOneDuplicateFileFast -Path $Path - Find duplicate files in the user documents folder + .EXAMPLE + $Path = [Environment]::GetFolderPath('MyDocuments') + Find-PSOneDuplicateFileFast -Path $Path + Find duplicate files in the user documents folder - .EXAMPLE - Find-PSOneDuplicateFileFast -Path c:\windows -Filter *.log - find log files in the Windows folder with duplicate content + .EXAMPLE + Find-PSOneDuplicateFileFast -Path c:\windows -Filter *.log + Find log files in the C:\Windows folder with duplicate content - .LINK - https://powershell.one - #> + .EXAMPLE + Find-PSOneDuplicateFileFast -Filter *.jpg -MaxFileSize 2MB -AlgorithmName MD5 -TestPartialHash + This command will search the current folder for JPG files and will hash the + first 2MB using the MD5 algorithm. If there are duplicates of the partial hash + values it will check the files using the full file size to ensure that the files + are truly duplicates. + .LINK + https://powershell.one - param - ( - # Path of folder to recursively search - [String] - [Parameter(Mandatory)] - $Path, - - # Filter to apply. Default is '*' (all Files) - [String] - $Filter = '*', - - # when there are multiple files with same partial hash - # they may still be different. 
When setting this switch, - # full hashes are calculated which may take a very long time - # for large files and/or slow networks - [switch] - $TestPartialHash, - - # use partial hashes for files larger than this: - [int64] - $MaxFileSize = 100KB - ) - - # get a hashtable of all files of size greater 0 - # grouped by their length - - - # ENUMERATE ALL FILES RECURSIVELY - # call scriptblocks directly and pipe them together - # this is by far the fastest way and much faster than - # using Foreach-Object: - & { - try - { - # try and use the fast API way of enumerating files recursively - # this FAILS whenever there is any "Access Denied" errors - Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method' - [IO.DirectoryInfo]::new($Path).GetFiles('*', 'AllDirectories') - } - catch - { - # use PowerShell's own (slow) way of enumerating files if any error occurs: - Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method' - Get-ChildItem -Path $Path -File -Recurse -ErrorAction Ignore - } - } | - # EXCLUDE EMPTY FILES: - # use direct process blocks with IF (which is much faster than Where-Object): - & { - process - { - # if the file has content... - if ($_.Length -gt 0) - { - # let it pass through: - $_ - } - } - } | - # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE - # OTHER FILE WITH SAME SIZE - # use direct scriptblocks with own hashtable (which is much faster than Group-Object) - & { - begin - # start with an empty hashtable - { $hash = @{} } + .NOTES + Updated by Steven Judd on 2021/01/24: + Added ValidateScript to Path parameter to ensure the value is a directory and set the path default to the current path + Set the Filter parameter value for the enumeration of the files (it was set to always check all files) + Added AlgorithmName parameter to allow the algorithm to be specified + Added example to show the default path and how to use the MaxFileSize, AlgorithmName, and TestPartialHas parameters + Set positional parameter values on Path and Filter + #> - process - { - # group files by their length - # (use "length" as hashtable key) - $file = $_ - $key = $file.Length.toString() - - # if we see this key for the first time, create a generic - # list to hold group items, and store FileInfo objects in this list - # (specialized generic lists are faster than ArrayList): - if ($hash.ContainsKey($key) -eq $false) - { - $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new() - } - # add file to appropriate hashtable key: - $hash[$key].Add($file) - } - - end - { - # return only the files from groups with at least two files - # (if there is only one file with a given length, then it - # cannot have any duplicates for sure): - foreach($pile in $hash.Values) - { - # are there at least 2 files in this pile? 
- if ($pile.Count -gt 1) - { - # yes, add it to the candidates - $pile - } - } - } - } | - # CALCULATE THE NUMBER OF FILES TO HASH - # collect all files and hand over en-bloc - & { - end { ,@($input) } - } | - # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES: - # use a direct scriptblock call with a hashtable (much faster than Group-Object): - & { - begin - { - # start with an empty hashtable - $hash = @{} - - # since this is a length procedure, a progress bar is in order - # keep a counter of processed files: - $c = 0 - } - - process - { - $totalNumber = $_.Count - foreach($file in $_) - { - - # update progress bar - $c++ - - # update progress bar every 20 files: - if ($c % 20 -eq 0 -or $file.Length -gt 100MB) - { - $percentComplete = $c * 100 / $totalNumber - Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete - } - - # use the file hash of this file PLUS file length as a key to the hashtable - # use the fastest algorithm SHA1, and use partial hashes for files larger than 100KB: - $bufferSize = [Math]::Min(100KB, $MaxFileSize) - $result = Get-PsOneFileHash -StartPosition 1KB -Length $MaxFileSize -BufferSize $bufferSize -AlgorithmName SHA1 -Path $file.FullName + param( + # Enter the Path of the folder to recursively search for duplicate files. + # The default value is the current folder. + [String] + [Parameter(Position = 0)] + [ValidateScript( { + if (Test-Path -Path $_ -PathType Container) { + return $true + } + else { + #Test-Path check failed + throw "Path `'$_`' is invalid. It must be a directory." + } + })] + $Path = '.', + + # Enter a filter value to apply to the file search. + # The default value is '*' (all Files) + [String] + [Parameter(Position = 1)] + $Filter = '*', - # add a "P" to partial hashes: - if ($result.IsPartialHash) { - $partialHash = 'P' - } - else - { - $partialHash = '' - } + # When there are multiple files with same partial hash they may still be different. + # When setting this switch, full hashes are calculated for all partial hashes. + # Caution: setting this switch parameter may take a very long time for large + # files and/or network paths. + [switch] + $TestPartialHash, + # If the file size is larger than the MaxFileSize value the function will use a + # partial hash using the specified amount of the beginning of the file. + # The default value is 100KB. + [int64] + $MaxFileSize = 100KB, + + # Select the hash algorithm to use. The fastest algorithm is SHA1. MD5 is second best + # in terms of speed. Slower algorithms provide more secure hashes with a lesser chance + # of duplicates with different content. + # The default value is SHA1. + [Security.Cryptography.HashAlgorithmName] + [ValidateSet("SHA1", "SHA256", "SHA384", "SHA512", "MD5")] + $AlgorithmName = 'SHA1', + + # Gets the items in the specified locations and in all child items of the locations. 
+ [switch] + $Recurse + ) + + # get a hashtable of all files of size greater 0 + # grouped by their length - $key = '{0}:{1}{2}' -f $result.Hash, $file.Length, $partialHash - - # if we see this key the first time, add a generic list to this key: - if ($hash.ContainsKey($key) -eq $false) - { - $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new()) + # ENUMERATE ALL FILES RECURSIVELY + # call scriptblocks directly and pipe them together + # this is by far the fastest way and much faster than + # using Foreach-Object: + & { + try { + # try and use the fast API way of enumerating files recursively + # this FAILS whenever there is any "Access Denied" errors + Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method' + if ($Recurse) { + [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories') + } + else { + [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'TopDirectoryOnly') + } } - - # add the file to the approriate group: - $hash[$key].Add($file) - } - } - - end - { - # remove all hashtable keys with only one file in them - - - - # do a detail check on partial hashes - if ($TestPartialHash) - { - # first, CLONE the list of hashtable keys - # (we cannot remove hashtable keys while enumerating the live - # keys list): - $keys = @($hash.Keys).Clone() - $i = 0 - Foreach($key in $keys) - { - $i++ - $percentComplete = $i * 100 / $keys.Count - if ($hash[$key].Count -gt 1 -and $key.EndsWith('P')) - { - foreach($file in $hash[$key]) - { - Write-Progress -Activity 'Hashing Full File Content' -Status $file.Name -PercentComplete $percentComplete - $result = Get-FileHash -Path $file.FullName -Algorithm SHA1 - $newkey = '{0}:{1}' -f $result.Hash, $file.Length - if ($hash.ContainsKey($newkey) -eq $false) - { - $hash.Add($newkey, [Collections.Generic.List[System.IO.FileInfo]]::new()) - } - $hash[$newkey].Add($file) + catch { + # use PowerShell's own (slow) way of enumerating files if any error occurs: + Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method' + if ($Recurse) { + Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore + } + else { + Get-ChildItem -Path $Path -Filter $Filter -File -ErrorAction Ignore } - $hash.Remove($key) - } } - } - - # enumerate all keys... - $keys = @($hash.Keys).Clone() - - foreach($key in $keys) - { - # ...if key has only one file, remove it: - if ($hash[$key].Count -eq 1) - { - $hash.Remove($key) + } | + # EXCLUDE EMPTY FILES: + # use direct process blocks with IF (which is much faster than Where-Object): + & { + process { + # if the file has content... 
+ if ($_.Length -gt 0) { + # let it pass through: + $_ + } + } + } | + # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE + # OTHER FILE WITH SAME SIZE + # use direct scriptblocks with own hashtable (which is much faster than Group-Object) + & { + begin + # start with an empty hashtable + { $hash = @{ } } + + process { + # group files by their length + # (use "length" as hashtable key) + $file = $_ + $key = $file.Length.toString() + + # if we see this key for the first time, create a generic + # list to hold group items, and store FileInfo objects in this list + # (specialized generic lists are faster than ArrayList): + if ($hash.ContainsKey($key) -eq $false) { + $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new() + } + # add file to appropriate hashtable key: + $hash[$key].Add($file) + } #end process block + + end { + # return only the files from groups with at least two files + # (if there is only one file with a given length, then it + # cannot have any duplicates for sure): + foreach ($pile in $hash.Values) { + # are there at least 2 files in this pile? + if ($pile.Count -gt 1) { + # yes, add it to the candidates + $pile + } + } #end foreach ($pile in $hash.Values) + } #end end block + } | + # CALCULATE THE NUMBER OF FILES TO HASH + # collect all files and hand over en-bloc + & { + end { , @($input) } + } | + # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES: + # use a direct scriptblock call with a hashtable (much faster than Group-Object): + & { + begin { + # start with an empty hashtable + $hash = @{ } + + # since this is a lengthy procedure, a progress bar is in order + # keep a counter of processed files: + $c = 0 + } #end begin block + + process { + $totalNumber = $_.Count + foreach ($file in $_) { + + # update progress bar + $c++ + + # update progress bar every 20 files: + if ($c % 20 -eq 0 -or $file.Length -gt 100MB) { + $percentComplete = $c * 100 / $totalNumber + Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete + } + + # determine the buffer size from the smaller of 100KB or $MaxFileSize + $bufferSize = [Math]::Min(100KB, $MaxFileSize) + # use the specified algorithm and return partial hashes for files larger than $MaxFileSize: + $result = Get-PsOneFileHash -StartPosition 1KB -Length $MaxFileSize -BufferSize $bufferSize -AlgorithmName $AlgorithmName -Path $file.FullName + + # add a "P" to partial hashes: + if ($result.IsPartialHash) { + $partialHash = 'P' + } + else { + $partialHash = '' + } + + # use the file hash of this file PLUS file length as a key to the hashtable + $key = '{0}:{1}{2}' -f $result.Hash, $file.Length, $partialHash + + # if we see this key for the first time, add a generic list to this key: + if ($hash.ContainsKey($key) -eq $false) { + $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new()) + } + + # add the file to the appropriate group: + $hash[$key].Add($file) + } #end foreach ($file in $_) + } #end process block + + end { + # do a detailed check on partial hashes if $TestPartialHash is set + if ($TestPartialHash) { + # first, CLONE the list of hashtable keys + # (we cannot remove hashtable keys while enumerating the live keys list): + $keys = @($hash.Keys).Clone() + $i = 0 + foreach ($key in $keys) { + $i++ + $percentComplete = $i * 100 / $keys.Count + if ($hash[$key].Count -gt 1 -and $key.EndsWith('P')) { + foreach
($file in $hash[$key]) { + Write-Progress -Activity 'Hashing Full File Content' -Status $file.Name -PercentComplete $percentComplete + $result = Get-FileHash -Path $file.FullName -Algorithm $AlgorithmName + $newkey = '{0}:{1}' -f $result.Hash, $file.Length + if ($hash.ContainsKey($newkey) -eq $false) { + $hash.Add($newkey, [Collections.Generic.List[System.IO.FileInfo]]::new()) + } + $hash[$newkey].Add($file) + } #end foreach ($file in $hash[$key]) + #remove the partial key entry with more than one file from the $hash hashtable + $hash.Remove($key) + } #end if ($hash[$key].Count -gt 1 -and $key.EndsWith('P')) + } #end foreach ($key in $keys) + } #end if ($TestPartialHash) + + # enumerate all keys + $keys = @($hash.Keys).Clone() + + foreach ($key in $keys) { + # if key has only one file, remove it: + if ($hash[$key].Count -eq 1) { + $hash.Remove($key) + } + } #end foreach ($key in $keys) + + # return the hashtable with only duplicate files left: + $hash + } #end end block + } #end of last piped code block +} #end Find-PSOneDuplicateFileFast function \ No newline at end of file diff --git a/PSOneTools/2.4/Get-PSOneFileHash.ps1 b/PSOneTools/2.4/Get-PSOneFileHash.ps1 index a6d3adc..d1797c6 100644 --- a/PSOneTools/2.4/Get-PSOneFileHash.ps1 +++ b/PSOneTools/2.4/Get-PSOneFileHash.ps1 @@ -1,8 +1,8 @@ -function Get-PsOneFileHash -{ +function Get-PsOneFileHash { <# .SYNOPSIS - Calculates a unique hash value for file content and strings, and is capable of calculating partial hashes to speed up calculation for large content + Calculates a unique hash value for file content and strings, and is capable of calculating + partial hashes to speed up calculation for large content .DESCRIPTION Calculates a cryptographic hash for file content and strings to identify identical content. @@ -17,13 +17,13 @@ you calculate the expensive full hash only for files that have potential duplicates. .EXAMPLE - Get-PsOneFileHash -String "Hello World!" -Algorithm MD5 + Get-PsOneFileHash -String "Hello World!" -AlgorithmName MD5 Calculates the hash for a string using the MD5 algorithm .EXAMPLE - Get-PSOneFileHash -Path "$home\Documents\largefile.mp4" -StartPosition 1000 -Length 1MB -Algorithm SHA1 - Calculates the hash for the file content. If the file is larger than 1MB+1000, a partial hash is calculated, - starting at byte position 1000, and using 1MB of data + Get-PSOneFileHash -Path "$home\Documents\largefile.mp4" -StartPosition 1000 -Length 1MB -AlgorithmName SHA1 + Calculates the hash for the file content. If the file is larger than 1MB+1000, a partial + hash is calculated, starting at byte position 1000, and using 1MB of data .EXAMPLE Get-ChildItem -Path $home -Recurse -File -ErrorAction SilentlyContinue | @@ -34,62 +34,82 @@ $_.Group | Select-Object -Property Length, Hash, Path } | Out-GridView -Title 'Potential Duplicate Files' - Takes all files from the user profile and calculates a hash for each. Large files use a partial hash. - Results are grouped by hash and length. Any group with more than one member contains potential - duplicates. These are shown in a gridview. + Takes all files from the user profile and calculates a hash for each. Large files use a + partial hash. Results are grouped by hash and length. Any group with more than one member + contains potential duplicates. The final results are shown in a gridview. 
.LINK https://powershell.one - #> + .NOTES + Updated on 2021-01-24 by Steven Judd: + Added parameter validation to Path parameter + Updated parameter help to better explain parameter usage + Set Path param to be able to accept an array + Fixed parameter in help text example to use full parameter name + #> - [CmdletBinding(DefaultParameterSetName='File')] - param - ( - [Parameter(Mandatory,ValueFromPipeline,ValueFromPipelineByPropertyName,ParameterSetName='File',Position=0)] - [string] + [CmdletBinding(DefaultParameterSetName = 'File')] + + param ( + # Path to file with hashable content. This must be a path to a file. It uses an alias + # of "FullName" to allow the passing of Get-Item or Get-ChildItem objects to this + # function using the pipeline. + [Parameter(Mandatory, ValueFromPipeline, ValueFromPipelineByPropertyName, ParameterSetName = 'File', Position = 0)] + [string[]] [Alias('FullName')] - # path to file with hashable content + [ValidateScript( { + if (Test-Path -Path $_ -PathType Leaf) { + return $true + } + else { + #Test-Path check failed + throw "Path '$_' is invalid. It must be a file." + } + })] $Path, - [Parameter(Mandatory,ValueFromPipeline,ParameterSetName='String',Position=0)] + # String content to hash. This will allow the hashing of content that is not coming from a + # file's contents. + [Parameter(Mandatory, ValueFromPipeline, ParameterSetName = 'String', Position = 0)] [string] - # path to file with hashable content $String, + # Specify the byte position to start hashing. If no value is specified the default value + # is 1000 to skip past the standard file header content. [int] - [ValidateRange(0,1TB)] - # byte position to start hashing + [ValidateRange(0, 1TB)] $StartPosition = 1000, + # Specify the number of bytes to hash. Larger length increases accuracy of hash, whereas a + # smaller length increases hash calculation performance but runs the risk of duplications. + # The default value is 1MB. [long] - [ValidateRange(1KB,1TB)] - # bytes to hash. Larger length increases accuracy of hash. - # Smaller length increases hash calculation performance + [ValidateRange(1KB, 1TB)] $Length = 1MB, + # Specify an internal buffer size to read chunks. A larger buffer increases raw reading + # speed but slows down overall performance when too many bytes are read and increases + # memory pressure. Ideally, the Length parameter value should be equally dividable by the + # BufferSize parameter value. + # The default value is 32KB. [int] - # internal buffer size to read chunks - # a larger buffer increases raw reading speed but slows down - # overall performance when too many bytes are read and increases - # memory pressure - # Ideally, length should be equally dividable by this $BufferSize = 32KB, + # Select the hash algorithm to use. The fastest algorithm is SHA1. MD5 is second best + # in terms of speed. Slower algorithms provide more secure hashes with a lesser chance + # of duplicates with different content. + # The default value is SHA1 [Security.Cryptography.HashAlgorithmName] - [ValidateSet('MD5','SHA1','SHA256','SHA384','SHA512')] - # hash algorithm to use. The fastest algorithm is SHA1. MD5 is second best - # in terms of speed. Slower algorithms provide more secure hashes with a - # lesser chance of duplicates with different content + [ValidateSet('MD5', 'SHA1', 'SHA256', 'SHA384', 'SHA512')] $AlgorithmName = 'SHA1', + # This parameter will override partial hashing and always calculate the full hash. 
[Switch] - # overrides partial hashing and always calculates the full hash $Force ) - begin - { + begin { # what's the minimum size required for partial hashing? $minDataLength = $BufferSize + $StartPosition @@ -100,154 +120,129 @@ # are we hashing a file or a string? $isFile = $PSCmdlet.ParameterSetName -eq 'File' } - - process - { - # prepare the return object: - $result = [PSCustomObject]@{ - Path = $Path - Length = 0 - Algorithm = $AlgorithmName - Hash = '' - IsPartialHash = $false - StartPosition = $StartPosition - HashedContentSize = $Length - } - if ($isFile) - { - try - { - # check whether the file size is greater than the limit we set: - $file = [IO.FileInfo]$Path - $result.Length = $file.Length - - # test whether partial hashes should be used: - $result.IsPartialHash = ($result.Length -gt $minDataLength) -and (-not $Force.IsPresent) - } - catch - { - throw "Unable to access $Path" + process { + foreach ($item in $Path) { + # prepare the return object: + $result = [PSCustomObject]@{ + Path = $item + Length = 0 + Algorithm = $AlgorithmName + Hash = '' + IsPartialHash = $false + StartPosition = $StartPosition + HashedContentSize = $Length } - } - else - { - $result.Length = $String.Length - $result.IsPartialHash = ($result.Length -gt $minDataLength) -and (-not $Force.IsPresent) - } - # initialize the hash algorithm to use - # I decided to initialize the hash engine for every file to avoid collisions - # when using transform blocks. I am not sure whether this is really necessary, - # or whether initializing the hash engine in the begin() block is safe. - try - { - $algorithm = [Security.Cryptography.HashAlgorithm]::Create($algorithmName) - } - catch - { - throw "Unable to initialize algorithm $AlgorithmName" - } - try - { - if ($isFile) - { - # read the file, and make sure the file isn't changed while we read it: - $stream = [IO.File]::Open($Path, [IO.FileMode]::Open, [IO.FileAccess]::Read, [IO.FileShare]::Read) - - # is the file larger than the threshold so that a partial hash - # should be calculated? - if ($result.IsPartialHash) - { - # keep a counter of the bytes that were read for this file: - $bytesToRead = $Length - - # move to the requested start position inside the file content: - $stream.Position = $StartPosition - - # read the file content in chunks until the requested data is fed into the - # hash algorithm - while($bytesToRead -gt 0) - { - # either read the full chunk size, or whatever is left to read the desired - # total length: - $bytesRead = $stream.Read($buffer, 0, [Math]::Min($bytesToRead, $bufferSize)) - - # we should ALWAYS read at least one byte: - if ($bytesRead -gt 0) - { - # subtract the bytes read from the total number of bytes to read - # in order to calculate how many bytes need to be read in the next - # iteration of this loop: - $bytesToRead -= $bytesRead - - # if there won't be any more bytes to read, this is the last chunk of data, - # so we can finalize hash generation: - if ($bytesToRead -eq 0) - { - $null = $algorithm.TransformFinalBlock($buffer, 0, $bytesRead) - } - # else, if there are more bytes to follow, simply add them to the hash - # algorithm: - else - { - $null = $algorithm.TransformBlock($buffer, 0, $bytesRead, $buffer, 0) - } - } - else - { - throw 'This should never occur: no bytes read.' 
- } - } + if ($isFile) { + try { + # check whether the file size is greater than the limit we set: + $file = [IO.FileInfo]$item + $result.Length = $file.Length + + # test whether partial hashes should be used and if so sets IsPartialHash to $true: + $result.IsPartialHash = ($result.Length -gt $minDataLength) -and (-not $Force.IsPresent) } - else - { - # either the file was smaller than the buffer size, or -Force was used: - # the entire file hash is calculated: - $null = $algorithm.ComputeHash($stream) + catch { + throw "Unable to access $item" } + } #end if ($isFile) + else { + $result.Length = $String.Length + $result.IsPartialHash = ($result.Length -gt $minDataLength) -and (-not $Force.IsPresent) } - else - { - if ($result.IsPartialHash) - { - $bytes = [Text.Encoding]::UTF8.GetBytes($String.SubString($StartPosition, $Length)) - } - else - { - $bytes = [Text.Encoding]::UTF8.GetBytes($String) - } - - $null = $algorithm.ComputeHash($bytes) + # initialize the hash algorithm to use + # I decided to initialize the hash engine for every file to avoid collisions + # when using transform blocks. I am not sure whether this is really necessary, + # or whether initializing the hash engine in the begin() block is safe. + try { + $algorithm = [Security.Cryptography.HashAlgorithm]::Create($AlgorithmName) + } + catch { + throw "Unable to initialize algorithm $AlgorithmName" } + try { + if ($isFile) { + # read the file, and make sure the file isn't changed while we read it: + $stream = [IO.File]::Open($item, [IO.FileMode]::Open, [IO.FileAccess]::Read, [IO.FileShare]::Read) + + # is the file larger than the threshold so that a partial hash should be calculated? + if ($result.IsPartialHash) { + # keep a counter of the bytes that were read for this file: + $bytesToRead = $Length + + # move to the requested start position inside the file content: + $stream.Position = $StartPosition + + # read the file content in chunks until the requested data is fed into the hash algorithm + while ($bytesToRead -gt 0) { + # either read the full chunk size, or whatever is left to read the desired total length: + $bytesRead = $stream.Read($buffer, 0, [Math]::Min($bytesToRead, $bufferSize)) + + # we should ALWAYS read at least one byte: + if ($bytesRead -gt 0) { + # subtract the bytes read from the total number of bytes to read + # in order to calculate how many bytes need to be read in the next + # iteration of this loop: + $bytesToRead -= $bytesRead + + # if there won't be any more bytes to read, this is the last chunk of data, + # so we can finalize hash generation: + if ($bytesToRead -eq 0) { + $null = $algorithm.TransformFinalBlock($buffer, 0, $bytesRead) + } + # else, if there are more bytes to follow, simply add them to the hash + # algorithm: + else { + $null = $algorithm.TransformBlock($buffer, 0, $bytesRead, $buffer, 0) + } + } #end if ($bytesRead -gt 0) + else { + throw 'This should never occur: no bytes read.' 
+ } + } #end while ($bytesToRead -gt 0) + } #end if ($result.IsPartialHash) + else { + # either the file was smaller than the buffer size, or -Force was used: + # the entire file hash is calculated: + $null = $algorithm.ComputeHash($stream) + } + } #end if ($isFile) + else { + if ($result.IsPartialHash) { + $bytes = [Text.Encoding]::UTF8.GetBytes($String.SubString($StartPosition, $Length)) + } + else { + $bytes = [Text.Encoding]::UTF8.GetBytes($String) + } + $null = $algorithm.ComputeHash($bytes) + } #end else - # the calculated hash is stored in the prepared return object: - $result.Hash = [BitConverter]::ToString($algorithm.Hash).Replace('-','') + # the calculated hash is stored in the prepared return object: + $result.Hash = [BitConverter]::ToString($algorithm.Hash).Replace('-', '') - if (!$result.IsPartialHash) - { - $result.StartPosition = 0 - $result.HashedContentSize = $result.Length - } - } - catch - { - throw "Unable to calculate partial hash: $_" - } - finally - { - if ($PSCmdlet.ParameterSetName -eq 'File') - { - # free stream - $stream.Close() - $stream.Dispose() + #if IsPartialHash is $false, set the StartPosition and HashedContentSize values + if (-not($result.IsPartialHash)) { + $result.StartPosition = 0 + $result.HashedContentSize = $result.Length + } + } #end try block + catch { + throw "Unable to calculate partial hash: $_" } + finally { + if ($PSCmdlet.ParameterSetName -eq 'File') { + # free stream + $stream.Close() + $stream.Dispose() + } - # free algorithm and its resources: - $algorithm.Clear() - $algorithm.Dispose() - } - - # return result for the file - return $result - } -} \ No newline at end of file + # free algorithm and its resources: + $algorithm.Clear() + $algorithm.Dispose() + } #end finally block + + # return result for the file + $result + } #end foreach $item in $Path + } #end process block +} #end Get-PSOneFileHash function
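# --- Usage sketch (not part of the patch) ---
# A minimal, hypothetical example of how the revised Find-PSOneDuplicateFileFast might be
# called and how its hashtable result can be consumed. It assumes both updated scripts have
# been dot-sourced; the folder and filter are illustrative only.
. .\Find-PSOneDuplicateFileFast.ps1
. .\Get-PSOneFileHash.ps1

# search the current folder and all subfolders for *.jpg files with duplicate content:
$duplicates = Find-PSOneDuplicateFileFast -Path . -Filter *.jpg -Recurse

# each key is '<hash>:<length>', suffixed with 'P' when only a partial hash was calculated
# (add -TestPartialHash to re-verify those groups with full hashes);
# each value is a List[FileInfo] holding at least two files:
foreach ($entry in $duplicates.GetEnumerator()) {
    Write-Output "Key $($entry.Key) has $($entry.Value.Count) potential duplicates:"
    $entry.Value | ForEach-Object { Write-Output "  $($_.FullName)" }
}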
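# --- Partial-hash sketch (not part of the patch) ---
# A minimal, hypothetical example of the revised Get-PsOneFileHash parameters. The file path
# is illustrative; per the function's logic, any existing file larger than
# StartPosition + BufferSize is hashed partially unless -Force is used.
$file = "$home\Documents\largefile.mp4"

# hash 1MB of content starting at byte position 1000 (IsPartialHash is $true for large files):
Get-PsOneFileHash -Path $file -StartPosition 1000 -Length 1MB -BufferSize 32KB -AlgorithmName SHA1 |
    Select-Object -Property Path, Length, IsPartialHash, StartPosition, HashedContentSize, Hash

# -Force overrides partial hashing and always hashes the complete file content:
(Get-PsOneFileHash -Path $file -AlgorithmName SHA1 -Force).IsPartialHash   # returns $false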