Files
logpile/internal/parser/archive.go

326 lines
7.1 KiB
Go

package parser
import (
"archive/tar"
"archive/zip"
"bytes"
"compress/gzip"
"fmt"
"io"
"os"
"path/filepath"
"strings"
)
// Size limits, in bytes, applied while reading input into memory.
const (
	// maxSingleFileSize caps a plain single-file input (10 MiB).
	maxSingleFileSize = 10 << 20
	// maxZipArchiveSize caps a zip archive that is read from a stream (50 MiB).
	maxZipArchiveSize = 50 << 20
)
// ExtractedFile is one file held in memory after it has been read out of an
// archive or a plain input file.
type ExtractedFile struct {
// Path is the file's name: the entry name stored in the archive, or a name
// derived from the input file name (or gzip header) for single-file inputs.
Path string
// Content holds the bytes read for this file.
Content []byte
}
// ExtractArchive reads the file at archivePath and returns the files it
// contains. The handler is chosen from the lower-cased file extension:
// .gz/.tgz, .tar, .zip, or a plain .txt/.log file. Any other extension is
// reported as an unsupported format.
func ExtractArchive(archivePath string) ([]ExtractedFile, error) {
	suffix := strings.ToLower(filepath.Ext(archivePath))

	// Pick the handler first so there is a single call site below.
	var extract func(string) ([]ExtractedFile, error)
	switch suffix {
	case ".gz", ".tgz":
		extract = extractTarGz
	case ".tar":
		extract = extractTar
	case ".zip":
		extract = extractZip
	case ".txt", ".log":
		extract = extractSingleFile
	}
	if extract == nil {
		return nil, fmt.Errorf("unsupported archive format: %s", suffix)
	}
	return extract(archivePath)
}
// ExtractArchiveFromReader extracts the data read from r and returns the
// files it contains. filename is used only to choose the format (by its
// lower-cased extension: .gz/.tgz, .tar, .zip, .txt/.log) and, for the gzip
// and plain-file handlers, is passed on to them. Any other extension is
// reported as an unsupported format.
func ExtractArchiveFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
	suffix := strings.ToLower(filepath.Ext(filename))

	switch {
	case suffix == ".gz" || suffix == ".tgz":
		return extractTarGzFromReader(r, filename)
	case suffix == ".tar":
		return extractTarFromReader(r)
	case suffix == ".zip":
		return extractZipFromReader(r)
	case suffix == ".txt" || suffix == ".log":
		return extractSingleFileFromReader(r, filename)
	}
	return nil, fmt.Errorf("unsupported archive format: %s", suffix)
}
// extractTarGz opens the gzip-compressed file at archivePath and passes it,
// together with the file's base name, to extractTarGzFromReader.
func extractTarGz(archivePath string) ([]ExtractedFile, error) {
	src, openErr := os.Open(archivePath)
	if openErr != nil {
		return nil, fmt.Errorf("open archive: %w", openErr)
	}
	defer src.Close()

	name := filepath.Base(archivePath)
	return extractTarGzFromReader(src, name)
}
// extractTar opens the tar file at archivePath and extracts it with
// extractTarFromReader.
func extractTar(archivePath string) ([]ExtractedFile, error) {
	src, openErr := os.Open(archivePath)
	if openErr != nil {
		return nil, fmt.Errorf("open archive: %w", openErr)
	}
	defer src.Close()

	return extractTarFromReader(src)
}
// extractTarFromReader reads an uncompressed tar stream from r and returns
// every entry that is not a directory and whose declared size is at most
// 10 MiB. Entry names are kept exactly as stored in the archive. Any read
// error aborts extraction and discards the files collected so far.
func extractTarFromReader(r io.Reader) ([]ExtractedFile, error) {
	const sizeLimit = 10 * 1024 * 1024 // per-entry cap in bytes

	var extracted []ExtractedFile
	archive := tar.NewReader(r)

	for {
		entry, err := archive.Next()
		switch {
		case err == io.EOF:
			// End of archive: everything collected so far is the result.
			return extracted, nil
		case err != nil:
			return nil, fmt.Errorf("tar read: %w", err)
		}

		// Directories and oversized entries are skipped without an error.
		if entry.Typeflag == tar.TypeDir || entry.Size > sizeLimit {
			continue
		}

		body, err := io.ReadAll(archive)
		if err != nil {
			return nil, fmt.Errorf("read file %s: %w", entry.Name, err)
		}
		extracted = append(extracted, ExtractedFile{Path: entry.Name, Content: body})
	}
}
// extractTarGzFromReader decompresses a gzip stream from r and returns its
// contents.
//
// If the decompressed data is a tar archive, every entry that is not a
// directory and whose declared size is at most 10 MiB is returned under its
// archive name. Otherwise the data is treated as a single gzip-compressed
// file, named after the gzip header's Name field when that is set, or after
// filename with a trailing ".gz" removed.
//
// Only the first 50 MiB of decompressed data are read; the rest of a longer
// stream is ignored by that read rather than reported as an error.
func extractTarGzFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
	const (
		maxDecompressed = 50 * 1024 * 1024 // cap on decompressed bytes held in memory
		maxEntrySize    = 10 * 1024 * 1024 // tar entries larger than this are skipped
		tarBlockSize    = 512              // size of one tar header block
	)

	gzr, err := gzip.NewReader(r)
	if err != nil {
		return nil, fmt.Errorf("gzip reader: %w", err)
	}
	defer gzr.Close()

	decompressed, err := io.ReadAll(io.LimitReader(gzr, maxDecompressed))
	if err != nil {
		return nil, fmt.Errorf("read gzip content: %w", err)
	}

	// Probe the first tar header to decide between "tar.gz" and "plain .gz".
	tr := tar.NewReader(bytes.NewReader(decompressed))
	header, err := tr.Next()
	if err != nil {
		// A payload shorter than one tar block cannot be a tar archive, but
		// the tar reader reports it as io.ErrUnexpectedEOF rather than
		// tar.ErrHeader, so it has to be recognised separately.
		tooShortForTar := err == io.ErrUnexpectedEOF && len(decompressed) < tarBlockSize
		if err != tar.ErrHeader && err != io.EOF && !tooShortForTar {
			return nil, fmt.Errorf("tar read: %w", err)
		}

		// Not a tar archive: return the decompressed bytes as one file.
		baseName := strings.TrimSuffix(filename, ".gz")
		if gzr.Name != "" {
			baseName = gzr.Name
		}
		return []ExtractedFile{{Path: baseName, Content: decompressed}}, nil
	}

	// Valid tar archive: header already holds the first entry.
	var files []ExtractedFile
	for {
		if header.Typeflag != tar.TypeDir && header.Size <= maxEntrySize {
			content, err := io.ReadAll(tr)
			if err != nil {
				return nil, fmt.Errorf("read file %s: %w", header.Name, err)
			}
			files = append(files, ExtractedFile{Path: header.Name, Content: content})
		}

		header, err = tr.Next()
		if err == io.EOF {
			return files, nil
		}
		if err != nil {
			return nil, fmt.Errorf("tar read: %w", err)
		}
	}
}
// extractZip opens the zip archive at archivePath and returns every entry
// that is not a directory and whose recorded uncompressed size is at most
// 10 MiB. Entry names are kept exactly as stored in the archive. Failing to
// open or read any entry aborts extraction.
func extractZip(archivePath string) ([]ExtractedFile, error) {
	const sizeLimit = 10 * 1024 * 1024 // per-entry cap in bytes

	archive, err := zip.OpenReader(archivePath)
	if err != nil {
		return nil, fmt.Errorf("open zip: %w", err)
	}
	defer archive.Close()

	var extracted []ExtractedFile
	for _, entry := range archive.File {
		// Directories and oversized entries are skipped without an error.
		info := entry.FileInfo()
		if info.IsDir() || info.Size() > sizeLimit {
			continue
		}

		body, err := entry.Open()
		if err != nil {
			return nil, fmt.Errorf("open file %s: %w", entry.Name, err)
		}
		// Close right away instead of deferring, so handles do not pile up
		// across loop iterations.
		data, readErr := io.ReadAll(body)
		body.Close()
		if readErr != nil {
			return nil, fmt.Errorf("read file %s: %w", entry.Name, readErr)
		}

		extracted = append(extracted, ExtractedFile{Path: entry.Name, Content: data})
	}
	return extracted, nil
}
// extractZipFromReader buffers the whole zip archive from r in memory and
// returns every entry that is not a directory and whose recorded
// uncompressed size is at most 10 MiB. Archives larger than
// maxZipArchiveSize bytes are rejected. Failing to open or read any entry
// aborts extraction.
func extractZipFromReader(r io.Reader) ([]ExtractedFile, error) {
	const sizeLimit = 10 * 1024 * 1024 // per-entry cap in bytes

	// Ask for one byte more than allowed so that an oversized archive can be
	// told apart from one that is exactly at the limit.
	raw, err := io.ReadAll(io.LimitReader(r, maxZipArchiveSize+1))
	if err != nil {
		return nil, fmt.Errorf("read zip data: %w", err)
	}
	if len(raw) > maxZipArchiveSize {
		return nil, fmt.Errorf("zip too large: max %d bytes", maxZipArchiveSize)
	}

	// zip needs random access, which the in-memory copy provides.
	archive, err := zip.NewReader(bytes.NewReader(raw), int64(len(raw)))
	if err != nil {
		return nil, fmt.Errorf("open zip: %w", err)
	}

	var extracted []ExtractedFile
	for _, entry := range archive.File {
		// Directories and oversized entries are skipped without an error.
		info := entry.FileInfo()
		if info.IsDir() || info.Size() > sizeLimit {
			continue
		}

		body, err := entry.Open()
		if err != nil {
			return nil, fmt.Errorf("open file %s: %w", entry.Name, err)
		}
		data, readErr := io.ReadAll(body)
		body.Close()
		if readErr != nil {
			return nil, fmt.Errorf("read file %s: %w", entry.Name, readErr)
		}

		extracted = append(extracted, ExtractedFile{Path: entry.Name, Content: data})
	}
	return extracted, nil
}
// extractSingleFile opens the plain file at path and passes it, together
// with the file's base name, to extractSingleFileFromReader.
func extractSingleFile(path string) ([]ExtractedFile, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("open file: %w", openErr)
	}
	defer src.Close()

	name := filepath.Base(path)
	return extractSingleFileFromReader(src, name)
}
// extractSingleFileFromReader reads all of r and returns it as a one-element
// slice whose Path is the base name of filename. Input longer than
// maxSingleFileSize bytes is rejected with an error.
func extractSingleFileFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
	// Ask for one byte more than allowed so that oversized input can be told
	// apart from input that is exactly at the limit.
	limited := io.LimitReader(r, maxSingleFileSize+1)

	data, err := io.ReadAll(limited)
	if err != nil {
		return nil, fmt.Errorf("read file content: %w", err)
	}
	if len(data) > maxSingleFileSize {
		return nil, fmt.Errorf("file too large: max %d bytes", maxSingleFileSize)
	}

	single := ExtractedFile{Path: filepath.Base(filename), Content: data}
	return []ExtractedFile{single}, nil
}
// FindFileByPattern returns every file whose Path contains at least one of
// patterns as a substring, compared case-insensitively. Files keep their
// input order and each file appears at most once, however many patterns it
// matches. An empty pattern matches every file. The result is nil when
// nothing matches.
func FindFileByPattern(files []ExtractedFile, patterns ...string) []ExtractedFile {
	if len(files) == 0 || len(patterns) == 0 {
		return nil
	}

	// Lower-case each pattern once up front instead of once per file.
	needles := make([]string, len(patterns))
	for i, p := range patterns {
		needles[i] = strings.ToLower(p)
	}

	var result []ExtractedFile
	for _, f := range files {
		// Likewise lower-case the path once per file, not once per pattern.
		path := strings.ToLower(f.Path)
		for _, needle := range needles {
			if strings.Contains(path, needle) {
				result = append(result, f)
				break // first hit is enough; avoids adding the file twice
			}
		}
	}
	return result
}
// FindFileByName returns the first file whose base name (the last element of
// its Path) equals name, ignoring case. The returned pointer refers to a copy
// of the matching element, not to the element inside files. It returns nil
// when no file matches.
func FindFileByName(files []ExtractedFile, name string) *ExtractedFile {
	for i := range files {
		if base := filepath.Base(files[i].Path); strings.EqualFold(base, name) {
			found := files[i]
			return &found
		}
	}
	return nil
}