326 lines
7.1 KiB
Go
326 lines
7.1 KiB
Go
package parser
|
|
|
|
import (
|
|
"archive/tar"
|
|
"archive/zip"
|
|
"bytes"
|
|
"compress/gzip"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// Size limits applied while buffering input in memory.
const (
	// maxSingleFileSize is the largest plain (.txt/.log) input accepted by
	// extractSingleFileFromReader: 10 MiB.
	maxSingleFileSize = 10 * 1024 * 1024

	// maxZipArchiveSize is the largest zip payload extractZipFromReader is
	// willing to buffer: 50 MiB.
	maxZipArchiveSize = 50 * 1024 * 1024
)
|
|
|
|
// ExtractedFile is one file obtained from an archive (or from a standalone
// input file), held entirely in memory.
type ExtractedFile struct {
	// Path is the name of the entry: the name stored in the archive for tar
	// and zip entries, or a name derived from the input otherwise.
	Path string
	// Content holds the raw bytes of the entry.
	Content []byte
}
|
|
|
|
// ExtractArchive extracts tar.gz or zip archive and returns file contents
|
|
func ExtractArchive(archivePath string) ([]ExtractedFile, error) {
|
|
ext := strings.ToLower(filepath.Ext(archivePath))
|
|
|
|
switch ext {
|
|
case ".gz", ".tgz":
|
|
return extractTarGz(archivePath)
|
|
case ".tar":
|
|
return extractTar(archivePath)
|
|
case ".zip":
|
|
return extractZip(archivePath)
|
|
case ".txt", ".log":
|
|
return extractSingleFile(archivePath)
|
|
default:
|
|
return nil, fmt.Errorf("unsupported archive format: %s", ext)
|
|
}
|
|
}
|
|
|
|
// ExtractArchiveFromReader extracts archive from reader
|
|
func ExtractArchiveFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
|
|
ext := strings.ToLower(filepath.Ext(filename))
|
|
|
|
switch ext {
|
|
case ".gz", ".tgz":
|
|
return extractTarGzFromReader(r, filename)
|
|
case ".tar":
|
|
return extractTarFromReader(r)
|
|
case ".zip":
|
|
return extractZipFromReader(r)
|
|
case ".txt", ".log":
|
|
return extractSingleFileFromReader(r, filename)
|
|
default:
|
|
return nil, fmt.Errorf("unsupported archive format: %s", ext)
|
|
}
|
|
}
|
|
|
|
func extractTarGz(archivePath string) ([]ExtractedFile, error) {
|
|
f, err := os.Open(archivePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open archive: %w", err)
|
|
}
|
|
defer f.Close()
|
|
|
|
return extractTarGzFromReader(f, filepath.Base(archivePath))
|
|
}
|
|
|
|
func extractTar(archivePath string) ([]ExtractedFile, error) {
|
|
f, err := os.Open(archivePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open archive: %w", err)
|
|
}
|
|
defer f.Close()
|
|
|
|
return extractTarFromReader(f)
|
|
}
|
|
|
|
func extractTarFromReader(r io.Reader) ([]ExtractedFile, error) {
|
|
tr := tar.NewReader(r)
|
|
var files []ExtractedFile
|
|
|
|
for {
|
|
header, err := tr.Next()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return nil, fmt.Errorf("tar read: %w", err)
|
|
}
|
|
|
|
// Skip directories
|
|
if header.Typeflag == tar.TypeDir {
|
|
continue
|
|
}
|
|
|
|
// Skip large files (>10MB)
|
|
if header.Size > 10*1024*1024 {
|
|
continue
|
|
}
|
|
|
|
content, err := io.ReadAll(tr)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read file %s: %w", header.Name, err)
|
|
}
|
|
|
|
files = append(files, ExtractedFile{
|
|
Path: header.Name,
|
|
Content: content,
|
|
})
|
|
}
|
|
|
|
return files, nil
|
|
}
|
|
|
|
func extractTarGzFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
|
|
gzr, err := gzip.NewReader(r)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("gzip reader: %w", err)
|
|
}
|
|
defer gzr.Close()
|
|
|
|
// Read all decompressed content into buffer
|
|
// Limit to 50MB for plain gzip files, 10MB per file for tar.gz
|
|
decompressed, err := io.ReadAll(io.LimitReader(gzr, 50*1024*1024))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read gzip content: %w", err)
|
|
}
|
|
|
|
// Try to read as tar archive
|
|
tr := tar.NewReader(bytes.NewReader(decompressed))
|
|
var files []ExtractedFile
|
|
|
|
header, err := tr.Next()
|
|
if err != nil {
|
|
// Not a tar archive - treat as a single gzipped file
|
|
if strings.Contains(err.Error(), "invalid tar header") || err == io.EOF {
|
|
// Get base filename without .gz extension
|
|
baseName := strings.TrimSuffix(filename, ".gz")
|
|
if gzr.Name != "" {
|
|
baseName = gzr.Name
|
|
}
|
|
|
|
return []ExtractedFile{
|
|
{
|
|
Path: baseName,
|
|
Content: decompressed,
|
|
},
|
|
}, nil
|
|
}
|
|
return nil, fmt.Errorf("tar read: %w", err)
|
|
}
|
|
|
|
// It's a valid tar archive, process it
|
|
for {
|
|
// Skip directories
|
|
if header.Typeflag != tar.TypeDir {
|
|
// Skip large files (>10MB)
|
|
if header.Size <= 10*1024*1024 {
|
|
content, err := io.ReadAll(tr)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read file %s: %w", header.Name, err)
|
|
}
|
|
|
|
files = append(files, ExtractedFile{
|
|
Path: header.Name,
|
|
Content: content,
|
|
})
|
|
}
|
|
}
|
|
|
|
// Read next header
|
|
header, err = tr.Next()
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return nil, fmt.Errorf("tar read: %w", err)
|
|
}
|
|
}
|
|
|
|
return files, nil
|
|
}
|
|
|
|
func extractZip(archivePath string) ([]ExtractedFile, error) {
|
|
r, err := zip.OpenReader(archivePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open zip: %w", err)
|
|
}
|
|
defer r.Close()
|
|
|
|
var files []ExtractedFile
|
|
|
|
for _, f := range r.File {
|
|
if f.FileInfo().IsDir() {
|
|
continue
|
|
}
|
|
|
|
// Skip large files (>10MB)
|
|
if f.FileInfo().Size() > 10*1024*1024 {
|
|
continue
|
|
}
|
|
|
|
rc, err := f.Open()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open file %s: %w", f.Name, err)
|
|
}
|
|
|
|
content, err := io.ReadAll(rc)
|
|
rc.Close()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read file %s: %w", f.Name, err)
|
|
}
|
|
|
|
files = append(files, ExtractedFile{
|
|
Path: f.Name,
|
|
Content: content,
|
|
})
|
|
}
|
|
|
|
return files, nil
|
|
}
|
|
|
|
func extractZipFromReader(r io.Reader) ([]ExtractedFile, error) {
|
|
// Read all data into memory with a hard cap
|
|
data, err := io.ReadAll(io.LimitReader(r, maxZipArchiveSize+1))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read zip data: %w", err)
|
|
}
|
|
if len(data) > maxZipArchiveSize {
|
|
return nil, fmt.Errorf("zip too large: max %d bytes", maxZipArchiveSize)
|
|
}
|
|
|
|
// Create a ReaderAt from the byte slice
|
|
readerAt := bytes.NewReader(data)
|
|
|
|
// Open the zip archive
|
|
zipReader, err := zip.NewReader(readerAt, int64(len(data)))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open zip: %w", err)
|
|
}
|
|
|
|
var files []ExtractedFile
|
|
|
|
for _, f := range zipReader.File {
|
|
if f.FileInfo().IsDir() {
|
|
continue
|
|
}
|
|
|
|
// Skip large files (>10MB)
|
|
if f.FileInfo().Size() > 10*1024*1024 {
|
|
continue
|
|
}
|
|
|
|
rc, err := f.Open()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open file %s: %w", f.Name, err)
|
|
}
|
|
|
|
content, err := io.ReadAll(rc)
|
|
rc.Close()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read file %s: %w", f.Name, err)
|
|
}
|
|
|
|
files = append(files, ExtractedFile{
|
|
Path: f.Name,
|
|
Content: content,
|
|
})
|
|
}
|
|
|
|
return files, nil
|
|
}
|
|
|
|
func extractSingleFile(path string) ([]ExtractedFile, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("open file: %w", err)
|
|
}
|
|
defer f.Close()
|
|
|
|
return extractSingleFileFromReader(f, filepath.Base(path))
|
|
}
|
|
|
|
func extractSingleFileFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
|
|
content, err := io.ReadAll(io.LimitReader(r, maxSingleFileSize+1))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read file content: %w", err)
|
|
}
|
|
if len(content) > maxSingleFileSize {
|
|
return nil, fmt.Errorf("file too large: max %d bytes", maxSingleFileSize)
|
|
}
|
|
|
|
return []ExtractedFile{
|
|
{
|
|
Path: filepath.Base(filename),
|
|
Content: content,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
// FindFileByPattern finds files matching pattern in extracted files
|
|
func FindFileByPattern(files []ExtractedFile, patterns ...string) []ExtractedFile {
|
|
var result []ExtractedFile
|
|
for _, f := range files {
|
|
for _, pattern := range patterns {
|
|
if strings.Contains(strings.ToLower(f.Path), strings.ToLower(pattern)) {
|
|
result = append(result, f)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// FindFileByName finds file by exact name (case-insensitive)
|
|
func FindFileByName(files []ExtractedFile, name string) *ExtractedFile {
|
|
for _, f := range files {
|
|
if strings.EqualFold(filepath.Base(f.Path), name) {
|
|
return &f
|
|
}
|
|
}
|
|
return nil
|
|
}
|