package parser import ( "archive/tar" "archive/zip" "bytes" "compress/gzip" "fmt" "io" "os" "path/filepath" "strings" ) const maxSingleFileSize = 10 * 1024 * 1024 const maxZipArchiveSize = 50 * 1024 * 1024 // ExtractedFile represents a file extracted from archive type ExtractedFile struct { Path string Content []byte } // ExtractArchive extracts tar.gz or zip archive and returns file contents func ExtractArchive(archivePath string) ([]ExtractedFile, error) { ext := strings.ToLower(filepath.Ext(archivePath)) switch ext { case ".gz", ".tgz": return extractTarGz(archivePath) case ".tar": return extractTar(archivePath) case ".zip": return extractZip(archivePath) case ".txt", ".log": return extractSingleFile(archivePath) default: return nil, fmt.Errorf("unsupported archive format: %s", ext) } } // ExtractArchiveFromReader extracts archive from reader func ExtractArchiveFromReader(r io.Reader, filename string) ([]ExtractedFile, error) { ext := strings.ToLower(filepath.Ext(filename)) switch ext { case ".gz", ".tgz": return extractTarGzFromReader(r, filename) case ".tar": return extractTarFromReader(r) case ".zip": return extractZipFromReader(r) case ".txt", ".log": return extractSingleFileFromReader(r, filename) default: return nil, fmt.Errorf("unsupported archive format: %s", ext) } } func extractTarGz(archivePath string) ([]ExtractedFile, error) { f, err := os.Open(archivePath) if err != nil { return nil, fmt.Errorf("open archive: %w", err) } defer f.Close() return extractTarGzFromReader(f, filepath.Base(archivePath)) } func extractTar(archivePath string) ([]ExtractedFile, error) { f, err := os.Open(archivePath) if err != nil { return nil, fmt.Errorf("open archive: %w", err) } defer f.Close() return extractTarFromReader(f) } func extractTarFromReader(r io.Reader) ([]ExtractedFile, error) { tr := tar.NewReader(r) var files []ExtractedFile for { header, err := tr.Next() if err == io.EOF { break } if err != nil { return nil, fmt.Errorf("tar read: %w", err) } // Skip directories if header.Typeflag == tar.TypeDir { continue } // Skip large files (>10MB) if header.Size > 10*1024*1024 { continue } content, err := io.ReadAll(tr) if err != nil { return nil, fmt.Errorf("read file %s: %w", header.Name, err) } files = append(files, ExtractedFile{ Path: header.Name, Content: content, }) } return files, nil } func extractTarGzFromReader(r io.Reader, filename string) ([]ExtractedFile, error) { gzr, err := gzip.NewReader(r) if err != nil { return nil, fmt.Errorf("gzip reader: %w", err) } defer gzr.Close() // Read all decompressed content into buffer // Limit to 50MB for plain gzip files, 10MB per file for tar.gz decompressed, err := io.ReadAll(io.LimitReader(gzr, 50*1024*1024)) if err != nil { return nil, fmt.Errorf("read gzip content: %w", err) } // Try to read as tar archive tr := tar.NewReader(bytes.NewReader(decompressed)) var files []ExtractedFile header, err := tr.Next() if err != nil { // Not a tar archive - treat as a single gzipped file if strings.Contains(err.Error(), "invalid tar header") || err == io.EOF { // Get base filename without .gz extension baseName := strings.TrimSuffix(filename, ".gz") if gzr.Name != "" { baseName = gzr.Name } return []ExtractedFile{ { Path: baseName, Content: decompressed, }, }, nil } return nil, fmt.Errorf("tar read: %w", err) } // It's a valid tar archive, process it for { // Skip directories if header.Typeflag != tar.TypeDir { // Skip large files (>10MB) if header.Size <= 10*1024*1024 { content, err := io.ReadAll(tr) if err != nil { return nil, fmt.Errorf("read file %s: %w", header.Name, err) } files = append(files, ExtractedFile{ Path: header.Name, Content: content, }) } } // Read next header header, err = tr.Next() if err == io.EOF { break } if err != nil { return nil, fmt.Errorf("tar read: %w", err) } } return files, nil } func extractZip(archivePath string) ([]ExtractedFile, error) { r, err := zip.OpenReader(archivePath) if err != nil { return nil, fmt.Errorf("open zip: %w", err) } defer r.Close() var files []ExtractedFile for _, f := range r.File { if f.FileInfo().IsDir() { continue } // Skip large files (>10MB) if f.FileInfo().Size() > 10*1024*1024 { continue } rc, err := f.Open() if err != nil { return nil, fmt.Errorf("open file %s: %w", f.Name, err) } content, err := io.ReadAll(rc) rc.Close() if err != nil { return nil, fmt.Errorf("read file %s: %w", f.Name, err) } files = append(files, ExtractedFile{ Path: f.Name, Content: content, }) } return files, nil } func extractZipFromReader(r io.Reader) ([]ExtractedFile, error) { // Read all data into memory with a hard cap data, err := io.ReadAll(io.LimitReader(r, maxZipArchiveSize+1)) if err != nil { return nil, fmt.Errorf("read zip data: %w", err) } if len(data) > maxZipArchiveSize { return nil, fmt.Errorf("zip too large: max %d bytes", maxZipArchiveSize) } // Create a ReaderAt from the byte slice readerAt := bytes.NewReader(data) // Open the zip archive zipReader, err := zip.NewReader(readerAt, int64(len(data))) if err != nil { return nil, fmt.Errorf("open zip: %w", err) } var files []ExtractedFile for _, f := range zipReader.File { if f.FileInfo().IsDir() { continue } // Skip large files (>10MB) if f.FileInfo().Size() > 10*1024*1024 { continue } rc, err := f.Open() if err != nil { return nil, fmt.Errorf("open file %s: %w", f.Name, err) } content, err := io.ReadAll(rc) rc.Close() if err != nil { return nil, fmt.Errorf("read file %s: %w", f.Name, err) } files = append(files, ExtractedFile{ Path: f.Name, Content: content, }) } return files, nil } func extractSingleFile(path string) ([]ExtractedFile, error) { f, err := os.Open(path) if err != nil { return nil, fmt.Errorf("open file: %w", err) } defer f.Close() return extractSingleFileFromReader(f, filepath.Base(path)) } func extractSingleFileFromReader(r io.Reader, filename string) ([]ExtractedFile, error) { content, err := io.ReadAll(io.LimitReader(r, maxSingleFileSize+1)) if err != nil { return nil, fmt.Errorf("read file content: %w", err) } if len(content) > maxSingleFileSize { return nil, fmt.Errorf("file too large: max %d bytes", maxSingleFileSize) } return []ExtractedFile{ { Path: filepath.Base(filename), Content: content, }, }, nil } // FindFileByPattern finds files matching pattern in extracted files func FindFileByPattern(files []ExtractedFile, patterns ...string) []ExtractedFile { var result []ExtractedFile for _, f := range files { for _, pattern := range patterns { if strings.Contains(strings.ToLower(f.Path), strings.ToLower(pattern)) { result = append(result, f) break } } } return result } // FindFileByName finds file by exact name (case-insensitive) func FindFileByName(files []ExtractedFile, name string) *ExtractedFile { for _, f := range files { if strings.EqualFold(filepath.Base(f.Path), name) { return &f } } return nil }