Skip to content

Commit a5fdeb0

Browse files
committed
feat: scan source distributions for compiled code
The new helper function `scan_compiled_extensions` scans source distributions for compiled code. It detects files with common binary suffixes such as `.so` and `.dylib`, as well as files that start with a known executable magic header. The function is designed to detect packaging issues like sdists that ship pre-compiled code. It is not capable of detecting supply chain attacks or malicious code.

Signed-off-by: Christian Heimes <cheimes@redhat.com>
1 parent 5b096c0 commit a5fdeb0

2 files changed

Lines changed: 115 additions & 0 deletions

File tree

src/fromager/sources.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,9 @@ def build_sdist(
608608
sdist_root_dir=sdist_root_dir,
609609
build_env=build_env,
610610
)
611+
# look for compiled code in sdist
612+
scan_compiled_extensions(sdist_root_dir)
613+
611614
if req.url:
612615
# The default approach to making an sdist is to make a tarball from the
613616
# source directory, since most of the time we got the source directory
@@ -776,3 +779,85 @@ def validate_sdist_filename(
776779
dist_name=sdist_name,
777780
dist_version=sdist_version,
778781
)
782+
783+
784+
# File suffixes that mark a file as compiled code. A file with one of these
# suffixes is reported by ``scan_compiled_extensions`` without reading it.
# NOTE: this must be a real ``set`` (curly braces) to match its annotation
# and the ``set[str]`` parameter it is used as a default for.
_EXTENSION_SUFFIXES: set[str] = {
    ".so",  # Linux, BSD
    ".dylib",  # macOS
    ".pyd",  # Windows
    ".dll",  # Windows
    ".exe",  # Windows
}

# ignore Python, configs, C, C++, CUDA, Go, ROCm/hip, Rust, text files
# Files with these suffixes are skipped by the magic header check.
_IGNORE_SUFFIXES: set[str] = {
    ".c",
    ".cc",
    ".cu",
    ".cuh",
    ".go",
    ".h",
    ".hip",
    ".hpp",
    ".ini",
    ".md",
    ".py",
    ".rs",
    ".rst",
    ".sh",
    ".tml",
    ".toml",
    ".txt",
    ".yaml",
    ".yml",
}

# Leading bytes of executable / shared library file formats. Kept as a tuple
# so it can be passed directly to ``bytes.startswith``.
_MAGIC_HEADERS: tuple[bytes, ...] = (
    b"\x7fELF",  # Linux, BSD ELF
    # b"MZ",  # Windows executable (usually have dll, pyd, or exe file suffix)
    b"\xfe\xed\xfa\xcf",  # macOS 64-bit
    b"\xfe\xed\xfa\xce",  # macOS 32-bit
    b"\xca\xfe\xba\xbe",  # macOS universal
)
822+
823+
824+
def scan_compiled_extensions(
    root_dir: pathlib.Path,
    *,
    extension_suffixes: set[str] = _EXTENSION_SUFFIXES,
    ignore_suffixes: set[str] = _IGNORE_SUFFIXES,
    warn: bool = True,
) -> list[pathlib.Path]:
    """Scan directory tree for compiled code

    Detect files that have an extension suffix or magic header.

    A file is reported when its suffix is in ``extension_suffixes``. Any
    other file whose suffix is not in ``ignore_suffixes`` is opened and
    reported when its first bytes match one of ``_MAGIC_HEADERS``.

    :param root_dir: root of the directory tree to scan
    :param extension_suffixes: file suffixes that are reported without
        looking at the file content
    :param ignore_suffixes: file suffixes that are never opened for the
        magic header check
    :param warn: log a warning for every reported file
    :returns: paths (below ``root_dir``) of all reported files

    .. warning::

       The function is not designed to detect supply chain attacks or
       malicious code. It's merely a helper to detect packaging issues.
    """
    issues: list[pathlib.Path] = []
    for directory, _, filenames in root_dir.walk():
        for filename in filenames:
            filepath = directory / filename
            suffix = filepath.suffix

            if suffix in extension_suffixes:
                if warn:
                    logger.warning(
                        "file %s has a binary extension suffix",
                        filepath.relative_to(root_dir),
                    )
                issues.append(filepath)
                continue

            if suffix in ignore_suffixes:
                continue

            # walk() also lists dangling symlinks, FIFOs, sockets and device
            # nodes as "files". Opening them would raise or block, so only
            # regular files (or symlinks to them) get the header check.
            if not filepath.is_file():
                continue

            # every entry in _MAGIC_HEADERS is four bytes long
            with filepath.open("rb") as f:
                header = f.read(4)
            if header.startswith(_MAGIC_HEADERS):
                if warn:
                    logger.warning(
                        "file %s starts with an executable file magic header: %r",
                        filepath.relative_to(root_dir),
                        header,
                    )
                issues.append(filepath)
    return issues

tests/test_sources.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pathlib
2+
import sys
23
import typing
34
from unittest.mock import Mock, patch
45

@@ -275,3 +276,32 @@ def test_validate_sdist_file(
275276
else:
276277
with pytest.raises(ValueError):
277278
sources.validate_sdist_filename(req, version, sdist_file)
279+
280+
281+
# Grab the first eight bytes of the running Python interpreter binary; the
# parametrized test below writes them into a file as sample content.
with pathlib.Path(sys.executable).open("rb") as _f:
    _EXEC_HEADER = _f.read(8)
284+
285+
286+
@pytest.mark.parametrize(
    "filename,content,hit",
    [
        ("test.py", b"#!/usr/bin/python", False),
        ("test.so", b"ignore", True),
        ("test", _EXEC_HEADER, True),
        # assume that packages do not disguise compiled code as .py files.
        # A malicious actor can use more elaborate tricks to hide bad code.
        ("test.py", _EXEC_HEADER, False),
    ],
)
def test_scan_compiled_extensions(
    filename: str, content: bytes, hit: bool, tmp_path: pathlib.Path
) -> None:
    """Write one file into ``tmp_path`` and check whether the scan reports it."""
    target = tmp_path / filename
    target.write_bytes(content)
    # a hit reports exactly the file that was written, a miss reports nothing
    expected = [target] if hit else []
    assert sources.scan_compiled_extensions(tmp_path) == expected

0 commit comments

Comments
 (0)