""" Unit tests for document_parser.py """ import pytest from document_parser import DocumentParser, get_file_info class TestDocumentParser: """Test DocumentParser class.""" def setup_method(self): """Set up test fixtures.""" self.parser = DocumentParser() def test_can_parse_supported_extensions(self): """Test can_parse returns True for supported file types.""" supported_files = [ 'document.txt', 'readme.md', 'data.csv', 'config.json', 'report.pdf', 'document.docx', 'spreadsheet.xlsx', 'slides.pptx', 'script.py', 'code.java', 'style.css', 'index.html' ] for filename in supported_files: assert self.parser.can_parse(filename), f"Should parse {filename}" def test_can_parse_unsupported_extensions(self): """Test can_parse returns False for unsupported file types.""" unsupported_files = [ 'image.png', 'video.mp4', 'audio.mp3', 'archive.zip', 'binary.exe', 'document.doc' ] for filename in unsupported_files: assert not self.parser.can_parse(filename), f"Should not parse {filename}" def test_get_extension(self): """Test _get_extension method.""" assert self.parser._get_extension('file.txt') == '.txt' assert self.parser._get_extension('FILE.TXT') == '.txt' assert self.parser._get_extension('archive.tar.gz') == '.gz' assert self.parser._get_extension('noextension') == '' def test_parse_text_utf8(self): """Test parsing UTF-8 text files.""" content = "Hello World\nThis is a test".encode('utf-8') result = self.parser.parse(content, 'test.txt') assert result == "Hello World\nThis is a test" def test_parse_text_multiple_encodings(self): """Test parsing text with different encodings.""" content = "Test content" # UTF-8 result = self.parser._parse_text(content.encode('utf-8')) assert result == "Test content" # Latin-1 result = self.parser._parse_text(content.encode('latin-1')) assert result == "Test content" def test_parse_unsupported_file_raises_error(self): """Test parsing unsupported file type raises ValueError.""" with pytest.raises(ValueError, match="Unsupported file type"): self.parser.parse(b"content", "file.exe") def test_parse_json(self): """Test parsing JSON files.""" content = '{"key": "value", "number": 123}'.encode('utf-8') result = self.parser.parse(content, 'data.json') assert '"key": "value"' in result assert '"number": 123' in result def test_parse_csv(self): """Test parsing CSV files.""" content = "name,age,city\nAlice,30,NYC\nBob,25,LA".encode('utf-8') result = self.parser.parse(content, 'data.csv') assert "Alice" in result assert "30" in result assert "NYC" in result class TestGetFileInfo: """Test get_file_info function.""" def test_document_category(self): """Test document file type categorization.""" info = get_file_info('report.pdf', 1024) assert info['category'] == 'document' assert info['extension'] == '.pdf' def test_spreadsheet_category(self): """Test spreadsheet file type categorization.""" info = get_file_info('data.xlsx', 2048) assert info['category'] == 'spreadsheet' assert info['extension'] == '.xlsx' def test_presentation_category(self): """Test presentation file type categorization.""" info = get_file_info('slides.pptx', 4096) assert info['category'] == 'presentation' def test_code_category(self): """Test code file type categorization.""" info = get_file_info('script.py', 512) assert info['category'] == 'code' def test_image_category(self): """Test image file type categorization.""" info = get_file_info('photo.jpg', 8192) assert info['category'] == 'image' def test_file_size_bytes(self): """Test file size formatting in bytes.""" info = get_file_info('small.txt', 512) assert info['size_formatted'] == '512 B' assert info['size_bytes'] == 512 def test_file_size_kilobytes(self): """Test file size formatting in KB.""" info = get_file_info('medium.txt', 2048) assert 'KB' in info['size_formatted'] assert info['size_bytes'] == 2048 def test_file_size_megabytes(self): """Test file size formatting in MB.""" info = get_file_info('large.pdf', 5 * 1024 * 1024) assert 'MB' in info['size_formatted'] assert '5.0' in info['size_formatted'] def test_file_size_gigabytes(self): """Test file size formatting in GB.""" info = get_file_info('huge.zip', 2 * 1024 * 1024 * 1024) assert 'GB' in info['size_formatted'] assert '2.0' in info['size_formatted'] def test_unknown_extension(self): """Test unknown file extension.""" info = get_file_info('file.xyz', 1024) assert info['category'] == 'file' assert info['extension'] == '.xyz' def test_no_extension(self): """Test file with no extension.""" info = get_file_info('README', 1024) assert info['extension'] == ''